Example #1
import yaml
from os.path import exists
from pkg_resources import resource_stream

from machine_learning_hep.logger import get_logger


def load_config(user_path: str, default_path: tuple) -> dict:
    """
    Load the configuration from a user-provided YAML file, falling back to the
    package default if no user config is given.
    Args:
        user_path: path to a YAML file
        default_path: (package, resource name) tuple locating the default resource
    Returns:
        dictionary built from YAML
    """
    logger = get_logger()
    stream = None
    if user_path is None:
        logger.debug("Using default config: %s %s", default_path[0], default_path[1])
        stream = resource_stream(default_path[0], default_path[1])
    else:
        if not exists(user_path):
            logger_string = f"The file {user_path} does not exist."
            logger.fatal(logger_string)
        stream = open(user_path)
    return yaml.load(stream, yaml.FullLoader)
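
A minimal usage sketch (the package and resource names below are hypothetical):

# Fall back to the packaged default when no user config is given,
# otherwise read the user's YAML file.
cfg_default = load_config(None, ("machine_learning_hep.data", "config_default.yml"))  # hypothetical resource
cfg_user = load_config("my_config.yml", ("machine_learning_hep.data", "config_default.yml"))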
class AnalyzerManager:
    """
    Manager class handling analysis and systematic objects
    """

    def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args):

        self.ana_class = ana_class
        self.database = database
        self.case = case
        self.typean = typean
        self.doperiodbyperiod = doperiodbyperiod

        # Additional arguments to be forwarded to the analyzers
        self.add_args = args

        self.logger = get_logger()

        self.analyzers = []
        self.after_burner = None

        self.is_initialized = False
Example #3
def calc_sigeff_steps(num_steps, df_sig, name, multiclass_labels):
    logger = get_logger()
    if multiclass_labels is None:
        ns_left = int(num_steps / 10) - 1
        ns_right = num_steps - ns_left
        x_axis_left = np.linspace(0., 0.49, ns_left)
        x_axis_right = np.linspace(0.5, 1.0, ns_right)
        x_axis = np.concatenate((x_axis_left, x_axis_right))
    else:
        x_axis = np.linspace(0, 0.4, num_steps)
    if df_sig.empty:
        logger.error("Efficiency denominator is empty (no signal candidates)")
        # Return zeros matching the number of scanned thresholds
        # (num_steps ** 2 threshold pairs in the multiclass case)
        n_out = num_steps ** 2 if multiclass_labels is not None else num_steps
        eff_array = [0] * n_out
        eff_err_array = [0] * n_out
        return eff_array, eff_err_array, x_axis
    num_tot_cand = len(df_sig)
    eff_array = []
    eff_err_array = []
    if multiclass_labels is not None:
        for thr0 in x_axis:
            for thr1 in x_axis:
                mlsel_multi0 = 'y_test_prob' + name + multiclass_labels[
                    0] + ' <= ' + str(thr0)
                mlsel_multi1 = 'y_test_prob' + name + multiclass_labels[
                    1] + ' >= ' + str(thr1)
                mlsel_multi = mlsel_multi0 + ' and ' + mlsel_multi1
                num_sel_cand = len(df_sig.query(mlsel_multi))
                eff, err_eff = calc_eff(num_sel_cand, num_tot_cand)
                eff_array.append(eff)
                eff_err_array.append(err_eff)
    else:
        for thr in x_axis:
            num_sel_cand = len(
                df_sig[df_sig['y_test_prob' + name].values >= thr])
            eff, err_eff = calc_eff(num_sel_cand, num_tot_cand)
            eff_array.append(eff)
            eff_err_array.append(err_eff)

    return eff_array, eff_err_array, x_axis
def calc_bkg(df_bkg, name, num_step, fit_region, bin_width, sig_region):
    """
    Estimate the number of background candidates under the signal peak. This is obtained
    from real data with a fit of the sidebands of the invariant mass distribution.
    """
    logger = get_logger()
    x_axis = np.linspace(0, 1.00, num_step)
    bkg_array = []
    bkg_err_array = []
    num_bins = (fit_region[1] - fit_region[0]) / bin_width
    num_bins = int(round(num_bins))
    bin_width = (fit_region[1] - fit_region[0]) / num_bins

    logger.debug("To fit the bkg an exponential function is used")
    for thr in x_axis:
        bkg = 0.
        bkg_err = 0.
        hmass = TH1F('hmass', '', num_bins, fit_region[0], fit_region[1])
        bkg_sel_mask = df_bkg['y_test_prob' + name].values >= thr
        sel_mass_array = df_bkg[bkg_sel_mask]['inv_mass_ML'].values

        # Require a minimum number of selected candidates for a meaningful fit
        if len(sel_mass_array) > 5:
            for mass_value in np.nditer(sel_mass_array):
                hmass.Fill(mass_value)

            fit = hmass.Fit('expo', 'Q', '', fit_region[0], fit_region[1])
            # A fit status of 0 means the fit converged
            if int(fit) == 0:
                fit_func = hmass.GetFunction('expo')
                bkg = fit_func.Integral(sig_region[0],
                                        sig_region[1]) / bin_width
                bkg_err = fit_func.IntegralError(sig_region[0],
                                                 sig_region[1]) / bin_width
                del fit_func

        bkg_array.append(bkg)
        bkg_err_array.append(bkg_err)
        del hmass

    return bkg_array, bkg_err_array, x_axis
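
A usage sketch with hypothetical mass windows; df_bkg must provide an 'inv_mass_ML' column and the 'y_test_prob' + name score column:

fit_region = (1.72, 2.06)  # assumed sideband fit range in GeV/c^2
sig_region = (1.83, 1.91)  # assumed signal window in GeV/c^2
bkg, bkg_err, thresholds = calc_bkg(df_bkg, "xgboost", 101, fit_region, 0.01, sig_region)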
Example #5
def calc_sigeff_steps(num_steps, df_sig, name):
    logger = get_logger()
    ns_left = int(num_steps / 10) - 1
    ns_right = num_steps - ns_left
    x_axis_left = np.linspace(0., 0.49, ns_left)
    x_axis_right = np.linspace(0.5, 1.0, ns_right)
    x_axis = np.concatenate((x_axis_left, x_axis_right))
    if df_sig.empty:
        logger.error("In division denominator is empty")
        eff_array = [0] * num_steps
        eff_err_array = [0] * num_steps
        return eff_array, eff_err_array, x_axis
    num_tot_cand = len(df_sig)
    eff_array = []
    eff_err_array = []
    for thr in x_axis:
        num_sel_cand = len(df_sig[df_sig['y_test_prob' + name].values >= thr])
        eff, err_eff = calc_eff(num_sel_cand, num_tot_cand)
        eff_array.append(eff)
        eff_err_array.append(err_eff)

    return eff_array, eff_err_array, x_axis
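
A usage sketch with a toy dataframe, assuming calc_eff returns an (efficiency, error) pair as used above; the model name "xgboost" is illustrative:

import numpy as np
import pandas as pd

# Toy signal sample with a model-score column named 'y_test_prob' + name
df_sig = pd.DataFrame({"y_test_probxgboost": np.random.uniform(0., 1., 1000)})
eff, eff_err, thresholds = calc_sigeff_steps(101, df_sig, "xgboost")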
Example #6
def getclf_xgboost(model_config):

    logger = get_logger()
    logger.debug("Load xgboost models")

    if "xgboost" not in model_config:
        logger.debug("No xgboost models found")
        return [], []

    classifiers = []
    names = []

    for c in model_config["xgboost"]:
        try:
            model = getattr(templates_xgboost, c)(model_config["xgboost"][c])
            classifiers.append(model)
            names.append(c)
            logger.info("Added xgboost model %s", c)
        except AttributeError:
            logger.critical("Could not load xgboost model %s", c)

    return classifiers, names
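
A sketch of the expected model_config structure; the template name xgboost_classifier is an assumption and must match a factory function in templates_xgboost:

model_config = {
    "xgboost": {
        "xgboost_classifier": {"max_depth": 5, "n_estimators": 300},  # hypothetical parameters
    }
}
classifiers, names = getclf_xgboost(model_config)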
Example #7
    def make_and_fill(self, binx, namex, biny=None, namey=None):
        """
        Make histograms and fill them based on their axis titles
        """
        # Check that the X column exists (common to the 1D and 2D case)
        if namex not in self.source_dataframe:
            get_logger().warning(
                "Column %s for X axis does not exist in dataframe, skipping histogram",
                namex)
            return
        if namey:
            if namey not in self.source_dataframe:
                get_logger().warning(
                    "Column %s for Y axis does not exist in dataframe, skipping histogram",
                    namey)
                return
            h_name = f"hVal_{namex}_vs_{namey}{self.collection_tag}"
            h_tit = f" ; {namex} ; {namey}"
            h = makefill2dhist(self.source_dataframe, h_name, binx, biny,
                               namex, namey)
            h.SetTitle(h_tit)
        else:
            h_name = f"hVal_{namex}{self.collection_tag}"
            h_tit = f" ; {namex} ; Entries"
            h = makefill1dhist(self.source_dataframe, h_name, h_tit, binx,
                               namex)
        if self.verbose:
            get_logger().info("Filling histogram %s", h.GetName())
        self.histograms.append(h)
Example #8
def prep_mlsamples(df_sig, df_bkg, namesig, nevt_sig, nevt_bkg, test_frac, rnd_splt):

    logger = get_logger()
    if nevt_sig > len(df_sig):
        logger.warning("There are not enough signal events")
    if nevt_bkg > len(df_bkg):
        logger.warning("There are not enough background events")

    nevt_sig = min(len(df_sig), nevt_sig)
    nevt_bkg = min(len(df_bkg), nevt_bkg)

    logger.info("Used number of signal events is %d", nevt_sig)
    logger.info("Used number of background events is %d", nevt_bkg)

    # Use .copy() to avoid pandas SettingWithCopyWarning when adding the target column
    df_sig = df_sig[:nevt_sig].copy()
    df_bkg = df_bkg[:nevt_bkg].copy()
    df_sig[namesig] = 1
    df_bkg[namesig] = 0
    df_ml = pd.concat([df_sig, df_bkg])
    df_ml_train, df_ml_test = train_test_split(df_ml, test_size=test_frac, random_state=rnd_splt)

    logger.info("%d events for training and %d for testing", len(df_ml_train), len(df_ml_test))
    return df_ml_train, df_ml_test
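
A usage sketch with toy dataframes (column and label names are illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split

df_sig = pd.DataFrame({"feat": range(100)})
df_bkg = pd.DataFrame({"feat": range(100, 300)})
# 80/20 train/test split with fixed random seeds
df_train, df_test = prep_mlsamples(df_sig, df_bkg, "signal", 100, 200, 0.2, 42)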
Example #9
def do_gridsearch(names, classifiers, param_grid, refit_arr, x_train, y_train_,
                  cv_, ncores):
    logger = get_logger()
    grid_search_models_ = []
    grid_search_bests_ = []
    list_scores_ = []
    for _, clf, param_cv, refit in zip(names, classifiers, param_grid,
                                       refit_arr):
        grid_search = GridSearchCV(clf,
                                   param_cv,
                                   cv=cv_,
                                   refit=refit,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=ncores)
        grid_search_model = grid_search.fit(x_train, y_train_)
        cvres = grid_search.cv_results_
        for mean_score, params in zip(cvres["mean_test_score"],
                                      cvres["params"]):
            logger.info("RMSE %f for parameters %s", np.sqrt(-mean_score), params)
        list_scores_.append(pd.DataFrame(cvres))
        grid_search_best = grid_search.best_estimator_.fit(x_train, y_train_)
        grid_search_models_.append(grid_search_model)
        grid_search_bests_.append(grid_search_best)
    return grid_search_models_, grid_search_bests_, list_scores_
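
A usage sketch; the estimator and parameter grid below are illustrative:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

x_train = np.random.rand(200, 4)
y_train = np.random.rand(200)
param_grid = [{"n_estimators": [50, 100], "max_depth": [3, 5]}]
models, bests, scores = do_gridsearch(["random_forest"], [RandomForestRegressor()],
                                      param_grid, [True], x_train, y_train, 5, 1)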
Example #10
def create_mlsamples(df_sig, df_bkg, sel_opt_sig, main_dict, sel_bkg,  # pylint: disable=too-many-arguments
                     rnd_shuffle, var_signal, var_training, nevt_sig,
                     nevt_bkg, test_frac, rnd_splt):
    df_sig = filter_df_cand(df_sig, main_dict, sel_opt_sig)
    df_bkg = df_bkg.query(sel_bkg)
    df_sig = shuffle(df_sig, random_state=rnd_shuffle)
    df_bkg = shuffle(df_bkg, random_state=rnd_shuffle)
    df_ml_train, df_ml_test = prep_mlsamples(df_sig, df_bkg, var_signal,
                                             nevt_sig, nevt_bkg, test_frac,
                                             rnd_splt)
    df_sig_train, df_bkg_train = split_df_sigbkg(df_ml_train, var_signal)
    df_sig_test, df_bkg_test = split_df_sigbkg(df_ml_test, var_signal)
    logger = get_logger()
    logger.info("Events for ml train %d and test %d", len(df_ml_train),
                len(df_ml_test))
    logger.info("Events for signal train %d and test %d", len(df_sig_train),
                len(df_sig_test))
    logger.info("Events for bkg train %d and test %d", len(df_bkg_train),
                len(df_bkg_test))
    x_train = df_ml_train[var_training]
    y_train = df_ml_train[var_signal]
    x_test = df_ml_test[var_training]
    y_test = df_ml_test[var_signal]

    return df_ml_train, df_ml_test, df_sig_train, df_bkg_train, df_sig_test, df_bkg_test, \
        x_train, y_train, x_test, y_test
Example #11
    def assert_model_config(self):  # pylint: disable=R0912
        """
        Validate the configuration for ML models and store it in self.model_config.
        Loading some models can depend on the run configuration, e.g. if
        run_config["activate_keras"] == 0 the keras config does not need to be
        checked and loaded.
        """
        logger = get_logger()
        logger.debug("Check sanity of user configs")

        user_config = {}
        if isinstance(self.model_config_input, str):
            user_config = parse_yaml(
                os.path.expanduser(self.model_config_input))
        elif isinstance(self.model_config_input, dict):
            user_config = self.model_config_input

        # At this point the asserted_config dict is just the one with defaults
        asserted_config = Configuration.get_meta_config("models")[
            self.run_config["mltype"]]
        user_config = user_config.get(self.run_config["mltype"], {})

        # Could probably be merged with the former loop; however, we want to catch
        # e.g. typos, because steering a run wanting keras - but writing kras -
        # could cost a lot of time when it needs to be done again.
        if self.run_config["mltype"] in self.run_config["activate_models"]:
            for backend, model in \
            self.run_config["activate_models"][self.run_config["mltype"]].items():
                if backend not in asserted_config:
                    logger.critical("Unknown backend %s.", backend)
                if model is None:
                    logger.critical("No models specified for backend %s.",
                                    backend)
                for name, activate in model.items():
                    if name not in asserted_config[backend]:
                        logger.critical("Unknown model %s for backend %s.",
                                        name, backend)
                    if name in asserted_config[backend]:
                        if activate is None or not isinstance(activate, bool):
                            logger.critical("Activation value of model %s for backend %s " \
                                             "must be specified as a boolean value.", name, backend)
                        asserted_config[backend][name]["activate"] = activate

        # Pop deactivated models
        for backend in list(asserted_config.keys()):
            for model in list(asserted_config[backend].keys()):
                if not asserted_config[backend][model]["activate"]:
                    del asserted_config[backend][model]
                else:
                    asserted_config[backend][model] = asserted_config[backend][
                        model]["default"]
                    if backend in user_config and model in user_config[backend]:
                        if len(user_config[backend][model]) != len(
                                asserted_config[backend][model]):
                            logger.critical(
                                "Parameter list for %s model %s differs",
                                backend, model)
                        for u in asserted_config[backend][model]:
                            asserted_config[backend][model][u] = \
                                user_config[backend][model].get(u,
                                                                asserted_config[backend][model][u])

        self.model_config = asserted_config
Example #12
from machine_learning_hep.models import fit, savemodels, test, apply, decisionboundaries
from machine_learning_hep.models import importanceplotall
from machine_learning_hep.mlperformance import cross_validation_mse, cross_validation_mse_continuous
from machine_learning_hep.mlperformance import plot_cross_validation_mse, plot_learning_curves
# from machine_learning_hep.mlperformance import confusion, plot_overtraining
from machine_learning_hep.mlperformance import precision_recall
from machine_learning_hep.grid_search import do_gridsearch, read_grid_dict, perform_plot_gridsearch
from machine_learning_hep.logger import get_logger
from machine_learning_hep.optimization import study_signif
from machine_learning_hep.efficiency import study_eff
DATA_PREFIX = os.path.expanduser("~/.machine_learning_hep")


def doclassification_regression(run_config, data, model_config, case, binmin,
                                binmax):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    logger = get_logger()
    logger.info("Start classification_regression run")

    mltype = run_config['mltype']
    mlsubtype = run_config['mlsubtype']
    loadsampleoption = run_config['loadsampleoption']
    rnd_shuffle = run_config['rnd_shuffle']
    nevt_sig = run_config['nevt_sig']
    nevt_bkg = run_config['nevt_bkg']
    test_frac = run_config['test_frac']
    rnd_splt = run_config['rnd_splt']
    docorrelation = run_config['docorrelation']
    dostandard = run_config['dostandard']
    dopca = run_config['dopca']
    dotraining = run_config['dotraining']
    dotesting = run_config['dotesting']
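
A sketch of the run_config keys consumed above (all keys appear in the code; the values are illustrative):

run_config = {
    "mltype": "BinaryClassification", "mlsubtype": "HFmeson",  # hypothetical values
    "loadsampleoption": 1, "rnd_shuffle": 12,
    "nevt_sig": 10000, "nevt_bkg": 10000,
    "test_frac": 0.2, "rnd_splt": 12,
    "docorrelation": 1, "dostandard": 0, "dopca": 0,
    "dotraining": 1, "dotesting": 1,
}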
Example #13
    def assert_run_config(self):
        """
        Validate the run configuration and store it in self.run_config, starting
        from self.run_config_input (a path to a configuration YAML or a dict).
        """
        logger = get_logger()
        logger.debug("Check sanity of user configs")

        user_run_config = {}
        if isinstance(self.run_config_input, str):
            user_run_config = parse_yaml(
                os.path.expanduser(self.run_config_input))
        elif isinstance(self.run_config_input, dict):
            user_run_config = self.run_config_input

        # At this point the asserted_config dict is just the one with defaults
        run_config = Configuration.get_meta_config("run")
        asserted_config = {k: run_config[k]["default"] for k in run_config}
        choices_config = {
            k: run_config[k]["choices"]
            for k in run_config if "choices" in run_config[k]
        }
        depends_config = {
            k: run_config[k]["depends"]
            for k in run_config if "depends" in run_config[k]
        }
        types_config = {
            k: run_config[k]["type_as"]
            for k in run_config if "type_as" in run_config[k]
        }
        # Check for unknown parameters and abort, since running the entire machinery
        # with a wrong setting (e.g. 'dotaining' instead of 'dotraining', which might
        # happen just by accident) would just be overhead.
        for k in user_run_config:
            if k not in asserted_config:
                logger.critical("Unknown parameter %s in config", k)
            elif user_run_config[k] is None:
                logger.critical("Missing value for parameter %s in config", k)

        # Replace all defaults if user specified parameter
        for k in asserted_config:
            asserted_config[k] = user_run_config.get(k, asserted_config[k])
            # If parameter is already set, check if consistent
            if k in choices_config and asserted_config[
                    k] not in choices_config[k]:
                logger.critical(
                    "Invalid value %s for parameter %s. Must be one of %s",
                    str(user_run_config[k]), k, str(choices_config[k]))
            if k in types_config:
                check_types = [type(t) for t in types_config[k]]
                if not isinstance(asserted_config[k], tuple(check_types)):
                    logger.critical(
                        "Invalid value type %s of parameter %s. Must be of type %s",
                        str(type(asserted_config[k])), k, str(check_types))

        # Can so far only depend on one parameter, change to combination
        # of parameters. Do we need to check for circular dependencies?
        for k in depends_config:
            if (asserted_config[depends_config[k]["parameter"]]
                    == depends_config[k]["value"]
                    and asserted_config[k] != depends_config[k]["set"]):
                asserted_config[k] = depends_config[k]["set"]
                logger.info(
                    "Parameter %s = %s enforced since it is required for %s == %s",
                    k, str(depends_config[k]["set"]),
                    str(depends_config[k]["parameter"]),
                    str(depends_config[k]["value"]))

        self.run_config = asserted_config
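
A sketch of the meta-config entries this validation expects; the structure is inferred from the code above and the field values are illustrative:

run_meta = {
    "mltype": {"default": "BinaryClassification",
               "choices": ["BinaryClassification", "Regression"]},  # hypothetical
    "nevt_sig": {"default": 10000, "type_as": [0]},  # value types are checked, here int
    "dotesting": {"default": True,
                  "depends": {"parameter": "dotraining", "value": True, "set": True}},
}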
Example #14
def make_asymm_y_errors(*args):
    if len(args) % 2 != 0:
        get_logger().fatal(
            "Need an even number ==> ((low, up) * n_central) of errors")
    return [[0, 0, args[i], args[i + 1]] for i in range(0, len(args), 2)]
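
A worked example of the output format:

# Two central points with (low, up) errors each; the x errors are zero
errs = make_asymm_y_errors(0.1, 0.2, 0.05, 0.07)
# -> [[0, 0, 0.1, 0.2], [0, 0, 0.05, 0.07]]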
Example #15
def signal_func(sgnfunc):
    if sgnfunc != "kGaus":
        get_logger().fatal("Unknown signal fit function %s", sgnfunc)
    # Normalised Gaussian: [0] = integral, [1] = mean, [2] = sigma
    return "[0]/(sqrt(2.*pi))/[2]*(exp(-(x-[1])*(x-[1])/2./[2]/[2]))"
Example #16
def calc_systematic_mesonratio(errnum_list, errden_list, n_bins, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Ds(MB or mult) / D0(MB or mult).
    Check if applicable to your situation
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    if n_bins != len(list(errnum_list.errors.values())[0]) or \
     n_bins != len(list(errden_list.errors.values())[0]):
        get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \
                            n_bins, len(list(errnum_list.errors.values())[0]), \
                            len(list(errden_list.errors.values())[0]))

    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \
                "feeddown_NB", "sigmav0", "branching_ratio"]

    j = 0
    for (_, errnum), (_, errden) in zip(errnum_list.errors.items(),
                                        errden_list.errors.items()):
        for i in range(n_bins):

            if errnum_list.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s",
                                   errnum_list.names[j])
            if errnum_list.names[j] != errden_list.names[j]:
                get_logger().fatal("Names not in same order: %s vs %s", \
                                   errnum_list.names[j], errden_list.names[j])

            for nb in range(len(tot_list[i])):
                if errnum_list.names[j] == "yield" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[
                        i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "cut" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[
                        i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "pid" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[
                        j] == "feeddown_mult_spectra" and justfd is not False:
                    #Fully correlated
                    ynum = errnum_list.errors["feeddown_NB"][i][4]
                    yden = errden_list.errors["feeddown_NB"][i][4]
                    #Relative uncertainties stored, make absolute
                    ynuml = ynum - ynum * errnum[i][2]
                    ydenl = yden - yden * errden[i][2]
                    ynumh = ynum + ynum * errnum[i][3]
                    ydenh = yden + yden * errden[i][3]
                    rat = [ynuml / ydenl, ynum / yden, ynumh / ydenh]
                    minsys = min(rat)
                    maxsys = max(rat)
                    if nb == 2:
                        tot_list[i][nb] += (rat[1] - minsys) * (
                            rat[1] - minsys) / (rat[1] * rat[1])
                    if nb == 3:
                        tot_list[i][nb] += (maxsys - rat[1]) * (
                            maxsys - rat[1]) / (rat[1] * rat[1])
                elif errnum_list.names[
                        j] == "feeddown_mult" and justfd is not False:
                    #Spectra here, skip ratio systematic
                    pass
                elif errnum_list.names[j] == "trigger" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[
                        j] == "feeddown_NB" and justfd is not False:
                    #Fully correlated under assumption central Fc value stays within Nb syst
                    ynum = errnum[i][4]
                    yden = errden[i][4]
                    #Absolute uncertainties stored
                    ynuml = ynum - errnum[i][2]
                    ydenl = yden - errden[i][2]
                    ynumh = ynum + errnum[i][3]
                    ydenh = yden + errden[i][3]
                    rat = [ynuml / ydenl, ynum / yden, ynumh / ydenh]
                    minsys = min(rat)
                    maxsys = max(rat)
                    if nb == 2:
                        tot_list[i][nb] += (rat[1] - minsys) * (
                            rat[1] - minsys) / (rat[1] * rat[1])
                    if nb == 3:
                        tot_list[i][nb] += (maxsys - rat[1]) * (
                            maxsys - rat[1]) / (rat[1] * rat[1])
                elif errnum_list.names[
                        j] == "multiplicity_weights" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "track" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[j] == "ptshape" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[
                        i][nb] * errden[i][nb]
                elif errnum_list.names[
                        j] == "multiplicity_interval" and justfd is not True:
                    #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels
                    #We use 1/3 of systematic of numerator
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] / 9
                elif errnum_list.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list.names[
                        j] == "branching_ratio" and justfd is not True:
                    #Uncorrelated, but usually not plotted in boxes, so pass
                    pass
        j = j + 1
    tot_list = np.sqrt(tot_list)
    return tot_list
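
The combination rule above is plain quadrature: per bin, the uncorrelated contributions are summed in squares and the square root is taken at the end. A minimal numeric illustration:

import numpy as np

err_num, err_den = 0.04, 0.03  # e.g. relative yield errors of numerator and denominator
total = np.sqrt(err_num**2 + err_den**2)  # 0.05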
Example #17
def calc_systematic_mesondoubleratio(errnum_list1, errnum_list2, errden_list1, \
                                     errden_list2, n_bins, dropbins=None, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Lc/D0_mult-i / Lc/D0_mult-j.
    Check if applicable to your situation
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    if n_bins != len(list(errnum_list1.errors.values())[0]) or \
     n_bins != len(list(errden_list1.errors.values())[0]):
        if dropbins is None:
            get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \
                                n_bins, len(list(errnum_list1.errors.values())[0]), \
                                len(list(errden_list1.errors.values())[0]))

    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \
                "feeddown_NB", "sigmav0", "branching_ratio"]

    j = 0
    for (_, errnum1), (_, errnum2), (_, errden1), (_, errden2) in zip(errnum_list1.errors.items(), \
                                                                      errnum_list2.errors.items(), \
                                                                      errden_list1.errors.items(), \
                                                                      errden_list2.errors.items()):
        for i in range(n_bins):

            inum = i
            iden = i
            if dropbins is not None:
                inum = dropbins[0][i]
                iden = dropbins[1][i]

            if errnum_list1.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s",
                                   errnum_list1.names[j])
            if errnum_list1.names[j] != errden_list2.names[j]:
                get_logger().fatal("Names not in same order: %s vs %s", \
                                   errnum_list1.names[j], errden_list2.names[j])

            for nb in range(len(tot_list[i])):
                if errnum_list1.names[j] == "yield" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                                       errnum2[inum][nb] * errnum2[inum][nb] + \
                                       errden1[iden][nb] * errden1[iden][nb] + \
                                       errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[j] == "cut" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                                       errnum2[inum][nb] * errnum2[inum][nb] + \
                                       errden1[iden][nb] * errden1[iden][nb] + \
                                       errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[j] == "pid" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[
                        j] == "feeddown_mult_spectra" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[
                        j] == "feeddown_mult" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "trigger" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[
                        j] == "feeddown_NB" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[
                        j] == "multiplicity_weights" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "track" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list1.names[j] == "ptshape" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum1[inum][nb] * errnum1[inum][nb] + \
                                       errnum2[inum][nb] * errnum2[inum][nb] + \
                                       errden1[iden][nb] * errden1[iden][nb] + \
                                       errden2[iden][nb] * errden2[iden][nb]
                elif errnum_list1.names[
                        j] == "multiplicity_interval" and justfd is not True:
                    #NB: Assuming ratio: 3prongs over 2prongs here! 2prong part cancels
                    #We use 1/3 of systematic of numerator
                    tot_list[i][
                        nb] += errden1[iden][nb] * errden1[iden][nb] / 9
                elif errnum_list1.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list1.names[
                        j] == "branching_ratio" and justfd is not True:
                    #Uncorrelated, but usually not plotted in boxes, so pass
                    pass
        j = j + 1
    tot_list = np.sqrt(tot_list)
    return tot_list
Example #18
    def __init__(self, data_param, case):
        self.logger = get_logger()
        self.logger.info("DnnOptimizer::Init\nCase: %s", case)

        # Dataset config
        self.grid_phi = data_param["grid_phi"]
        self.grid_z = data_param["grid_z"]
        self.grid_r = data_param["grid_r"]

        self.selopt_input = data_param["selopt_input"]
        self.selopt_output = data_param["selopt_output"]
        self.opt_train = data_param["opt_train"]
        self.opt_predout = data_param["opt_predout"]
        self.nameopt_predout = data_param["nameopt_predout"]
        self.dim_input = sum(self.opt_train)
        self.dim_output = sum(self.opt_predout)
        self.use_scaler = data_param["use_scaler"]

        # Directories
        self.dirmodel = data_param["dirmodel"]
        self.dirval = data_param["dirval"]
        self.diroutflattree = data_param["diroutflattree"]
        train_dir = data_param["dirinput_bias"] if data_param["train_bias"] \
                    else data_param["dirinput_nobias"]
        test_dir = data_param["dirinput_bias"] if data_param["test_bias"] \
                    else data_param["dirinput_nobias"]
        apply_dir = data_param["dirinput_bias"] if data_param["apply_bias"] \
                    else data_param["dirinput_nobias"]
        self.dirinput_train = "%s/SC-%d-%d-%d/" % \
                              (train_dir, self.grid_z, self.grid_r, self.grid_phi)
        self.dirinput_test = "%s/SC-%d-%d-%d/" % \
                             (test_dir, self.grid_z, self.grid_r, self.grid_phi)
        self.dirinput_apply = "%s/SC-%d-%d-%d/" % \
                              (apply_dir, self.grid_z, self.grid_r, self.grid_phi)

        # DNN config
        self.filters = data_param["filters"]
        self.pooling = data_param["pooling"]
        self.batch_size = data_param["batch_size"]
        self.shuffle = data_param["shuffle"]
        self.depth = data_param["depth"]
        self.batch_normalization = data_param["batch_normalization"]
        self.dropout = data_param["dropout"]
        self.epochs = data_param["epochs"]
        self.lossfun = data_param["lossfun"]
        self.metrics = data_param["metrics"]
        self.adamlr = data_param["adamlr"]

        self.params = {'phi_slice': self.grid_phi,
                       'r_row' : self.grid_r,
                       'z_col' : self.grid_z,
                       'batch_size': self.batch_size,
                       'shuffle': self.shuffle,
                       'opt_train' : self.opt_train,
                       'opt_predout' : self.opt_predout,
                       'selopt_input' : self.selopt_input,
                       'selopt_output' : self.selopt_output,
                       'use_scaler': self.use_scaler}

        self.suffix = "phi%d_r%d_z%d_filter%d_poo%d_drop%.2f_depth%d_batch%d_scaler%d" % \
                (self.grid_phi, self.grid_r, self.grid_z, self.filters, self.pooling,
                 self.dropout, self.depth, self.batch_normalization, self.use_scaler)
        self.suffix = "%s_useSCMean%d_useSCFluc%d" % \
                (self.suffix, self.opt_train[0], self.opt_train[1])
        self.suffix = "%s_pred_doR%d_dophi%d_doz%d" % \
                (self.suffix, self.opt_predout[0], self.opt_predout[1], self.opt_predout[2])
        self.suffix_ds = "phi%d_r%d_z%d" % \
                (self.grid_phi, self.grid_r, self.grid_z)

        if not os.path.isdir("plots"):
            os.makedirs("plots")

        if not os.path.isdir(self.dirmodel):
            os.makedirs(self.dirmodel)

        if not os.path.isdir(self.dirval):
            os.makedirs(self.dirval)

        self.logger.info("I am processing the configuration %s", self.suffix)
        if self.dim_output > 1:
            self.logger.fatal("You can predict only one distortion: the sum of opt_predout must be 1")
        self.logger.info("Inputs active for training: (SCMean, SCFluctuations)=(%d, %d)",
                         self.opt_train[0], self.opt_train[1])

        self.maxrandomfiles = data_param["maxrandomfiles"]
        self.range_mean_index = data_param["range_mean_index"]
        self.indices_events_means_train = None
        self.partition = None
        self.total_events = 0

        gROOT.SetStyle("Plain")
        gROOT.SetBatch()
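
A sketch of the data_param fields consumed by this constructor (all keys appear in the code above; the values are illustrative):

data_param = {
    "grid_phi": 90, "grid_z": 17, "grid_r": 17,
    "selopt_input": 0, "selopt_output": 0,
    "opt_train": [1, 1], "opt_predout": [1, 0, 0],
    "nameopt_predout": ["r", "phi", "z"],
    "use_scaler": 0,
    "dirmodel": "models", "dirval": "validation", "diroutflattree": "trees",
    "train_bias": False, "test_bias": False, "apply_bias": False,
    "dirinput_bias": "input_bias", "dirinput_nobias": "input_nobias",
    "filters": 4, "pooling": 0, "batch_size": 27, "shuffle": False,
    "depth": 4, "batch_normalization": 0, "dropout": 0.0,
    "epochs": 20, "lossfun": "mse", "metrics": "mse", "adamlr": 0.001,
    "maxrandomfiles": 500, "range_mean_index": [0, 26],
}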
Example #19
def calc_systematic_multovermb(errnum_list, errden_list, n_bins, justfd=-99):
    """
    Returns a list of total errors taking into account the defined correlations
    Propagation uncertainties defined for Ds(mult) / Ds(MB). Check if applicable to your situation
    """
    tot_list = [[0., 0., 0., 0.] for _ in range(n_bins)]
    if n_bins != len(list(errnum_list.errors.values())[0]) or \
     n_bins != len(list(errden_list.errors.values())[0]):
        get_logger().fatal("Number of bins and number of errors mismatch, %i vs. %i vs. %i", \
                            n_bins, len(list(errnum_list.errors.values())[0]), \
                            len(list(errden_list.errors.values())[0]))

    listimpl = ["yield", "cut", "pid", "feeddown_mult", "feeddown_mult_spectra", "trigger", \
                "multiplicity_interval", "multiplicity_weights", "track", "ptshape", \
                "feeddown_NB", "sigmav0", "branching_ratio"]

    j = 0
    for (_, errnum), (_, errden) in zip(errnum_list.errors.items(),
                                        errden_list.errors.items()):
        for i in range(n_bins):

            if errnum_list.names[j] not in listimpl:
                get_logger().fatal("Unknown systematic name: %s",
                                   errnum_list.names[j])
            if errnum_list.names[j] != errden_list.names[j]:
                get_logger().fatal("Names not in same order: %s vs %s", \
                                   errnum_list.names[j], errden_list.names[j])

            for nb in range(len(tot_list[i])):
                if errnum_list.names[j] == "yield" and justfd is not True:
                    #Partially correlated, take largest
                    tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \
                                        * max(errnum[i][nb], errden[i][nb])
                elif errnum_list.names[j] == "cut" and justfd is not True:
                    #Partially correlated, take largest
                    tot_list[i][nb] += max(errnum[i][nb], errden[i][nb]) \
                                        * max(errnum[i][nb], errden[i][nb])
                elif errnum_list.names[j] == "pid" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[
                        j] == "feeddown_mult" and justfd is not False:
                    #Assign directly from multiplicity case, no syst for MB
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[
                        j] == "feeddown_mult_spectra" and justfd is not False:
                    #Ratio here, skip spectra syst
                    pass
                elif errnum_list.names[j] == "trigger" and justfd is not True:
                    #Assign directly from multiplicity case, no syst for MB
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[
                        j] == "multiplicity_interval" and justfd is not True:
                    #FD: estimated using 7TeV strategy directly for ratio
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb]
                elif errnum_list.names[
                        j] == "multiplicity_weights" and justfd is not True:
                    #Uncorrelated
                    tot_list[i][nb] += errnum[i][nb] * errnum[i][nb] + errden[
                        i][nb] * errden[i][nb]
                elif errnum_list.names[j] == "track" and justfd is not True:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "ptshape" and justfd is not True:
                    #Correlated, assign difference
                    diff = abs(errnum[i][nb] - errden[i][nb])
                    tot_list[i][nb] += diff * diff
                elif errnum_list.names[
                        j] == "feeddown_NB" and justfd is not False:
                    #Correlated, do nothing
                    pass
                elif errnum_list.names[j] == "sigmav0" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
                elif errnum_list.names[
                        j] == "branching_ratio" and justfd is not True:
                    #Correlated and usually not plotted in boxes, do nothing
                    pass
        j = j + 1
    tot_list = np.sqrt(tot_list)
    return tot_list
def main():  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """
    Main plotting function
    """
    gROOT.SetBatch(True)

    # pylint: disable=unused-variable

    parser = argparse.ArgumentParser()
    parser.add_argument("--database-analysis",
                        "-d",
                        dest="database_analysis",
                        help="analysis database to be used",
                        required=True)
    parser.add_argument("--analysis",
                        "-a",
                        dest="type_ana",
                        help="choose type of analysis",
                        required=True)
    parser.add_argument("--input",
                        "-i",
                        dest="input_file",
                        help="results input file",
                        required=True)

    args = parser.parse_args()

    typean = args.type_ana
    shape = typean[len("jet_"):]
    print("Shape:", shape)

    file_in = args.input_file
    with open(args.database_analysis, "r") as file_db:
        data_param = yaml.safe_load(file_db)
    case = list(data_param.keys())[0]
    datap = data_param[case]

    logger = get_logger()

    i_cut = file_in.rfind("/")
    rootpath = file_in[:i_cut]

    # plotting
    # LaTeX string
    p_latexnhadron = datap["analysis"][typean]["latexnamehadron"]
    p_latexbin2var = datap["analysis"][typean]["latexbin2var"]
    v_varshape_latex = datap["analysis"][typean]["var_shape_latex"]

    # first variable (hadron pt)
    lpt_finbinmin = datap["analysis"][typean]["sel_an_binmin"]
    lpt_finbinmax = datap["analysis"][typean]["sel_an_binmax"]
    var1ranges = lpt_finbinmin.copy()
    var1ranges.append(lpt_finbinmax[-1])

    # second variable (jet pt)
    v_var2_binning = datap["analysis"][typean]["var_binning2"]  # name
    lvar2_binmin_reco = datap["analysis"][typean].get("sel_binmin2_reco", None)
    lvar2_binmax_reco = datap["analysis"][typean].get("sel_binmax2_reco", None)
    p_nbin2_reco = len(lvar2_binmin_reco)  # number of reco bins
    lvar2_binmin_gen = datap["analysis"][typean].get("sel_binmin2_gen", None)
    lvar2_binmax_gen = datap["analysis"][typean].get("sel_binmax2_gen", None)
    p_nbin2_gen = len(lvar2_binmin_gen)  # number of gen bins
    var2ranges_reco = lvar2_binmin_reco.copy()
    var2ranges_reco.append(lvar2_binmax_reco[-1])
    var2binarray_reco = array(
        "d",
        var2ranges_reco)  # array of bin edges to use in histogram constructors
    var2ranges_gen = lvar2_binmin_gen.copy()
    var2ranges_gen.append(lvar2_binmax_gen[-1])
    var2binarray_gen = array(
        "d",
        var2ranges_gen)  # array of bin edges to use in histogram constructors

    # observable (z, shape,...)
    v_varshape_binning = datap["analysis"][typean][
        "var_binningshape"]  # name (reco)
    v_varshape_binning_gen = datap["analysis"][typean][
        "var_binningshape_gen"]  # name (gen)
    lvarshape_binmin_reco = \
        datap["analysis"][typean].get("sel_binminshape_reco", None)
    lvarshape_binmax_reco = \
        datap["analysis"][typean].get("sel_binmaxshape_reco", None)
    p_nbinshape_reco = len(lvarshape_binmin_reco)  # number of reco bins
    lvarshape_binmin_gen = \
        datap["analysis"][typean].get("sel_binminshape_gen", None)
    lvarshape_binmax_gen = \
        datap["analysis"][typean].get("sel_binmaxshape_gen", None)
    p_nbinshape_gen = len(lvarshape_binmin_gen)  # number of gen bins
    varshaperanges_reco = lvarshape_binmin_reco.copy()
    varshaperanges_reco.append(lvarshape_binmax_reco[-1])
    varshapebinarray_reco = array(
        "d", varshaperanges_reco
    )  # array of bin edges to use in histogram constructors
    varshaperanges_gen = lvarshape_binmin_gen.copy()
    varshaperanges_gen.append(lvarshape_binmax_gen[-1])
    varshapebinarray_gen = array(
        "d", varshaperanges_gen
    )  # array of bin edges to use in histogram constructors

    file_results = TFile.Open(file_in)
    if not file_results:
        logger.fatal(make_message_notfound(file_in))

    ibin2 = 1

    suffix = "%s_%g_%g" % (v_var2_binning, lvar2_binmin_gen[ibin2],
                           lvar2_binmax_gen[ibin2])

    # HF data
    nameobj = "%s_hf_data_%d_stat" % (shape, ibin2)
    hf_data_stat = file_results.Get(nameobj)
    if not hf_data_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_hf_data_%d_syst" % (shape, ibin2)
    hf_data_syst = file_results.Get(nameobj)
    if not hf_data_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # HF PYTHIA
    nameobj = "%s_hf_pythia_%d_stat" % (shape, ibin2)
    hf_pythia_stat = file_results.Get(nameobj)
    if not hf_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # HF ratio
    nameobj = "%s_hf_ratio_%d_stat" % (shape, ibin2)
    hf_ratio_stat = file_results.Get(nameobj)
    if not hf_ratio_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_hf_ratio_%d_syst" % (shape, ibin2)
    hf_ratio_syst = file_results.Get(nameobj)
    if not hf_ratio_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # inclusive data
    nameobj = "%s_incl_data_%d_stat" % (shape, ibin2)
    incl_data_stat = file_results.Get(nameobj)
    if not incl_data_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_data_%d_syst" % (shape, ibin2)
    incl_data_syst = file_results.Get(nameobj)
    if not incl_data_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # inclusive PYTHIA
    nameobj = "%s_incl_pythia_%d_stat" % (shape, ibin2)
    incl_pythia_stat = file_results.Get(nameobj)
    if not incl_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_pythia_%d_syst" % (shape, ibin2)
    incl_pythia_syst = file_results.Get(nameobj)
    if not incl_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # inclusive ratio
    nameobj = "%s_incl_ratio_%d_stat" % (shape, ibin2)
    incl_ratio_stat = file_results.Get(nameobj)
    if not incl_ratio_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_incl_ratio_%d_syst" % (shape, ibin2)
    incl_ratio_syst = file_results.Get(nameobj)
    if not incl_ratio_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # quark PYTHIA
    nameobj = "%s_quark_pythia_%d_stat" % (shape, ibin2)
    quark_pythia_stat = file_results.Get(nameobj)
    if not quark_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_quark_pythia_%d_syst" % (shape, ibin2)
    quark_pythia_syst = file_results.Get(nameobj)
    if not quark_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # gluon PYTHIA
    nameobj = "%s_gluon_pythia_%d_stat" % (shape, ibin2)
    gluon_pythia_stat = file_results.Get(nameobj)
    if not gluon_pythia_stat:
        logger.fatal(make_message_notfound(nameobj, file_in))
    nameobj = "%s_gluon_pythia_%d_syst" % (shape, ibin2)
    gluon_pythia_syst = file_results.Get(nameobj)
    if not gluon_pythia_syst:
        logger.fatal(make_message_notfound(nameobj, file_in))

    # plot the results with systematic uncertainties and models

    size_can = [800, 800]
    offsets_axes = [0.8, 1.1]
    margins_can = [0.1, 0.13, 0.1, 0.03]
    size_thg = 0.05
    offset_thg = 0.85

    gStyle.SetErrorX(0)  # do not plot horizontal error bars of histograms
    fontsize = 0.035
    opt_leg_g = "FP"
    opt_plot_g = "2"

    list_new = []  # list to avoid losing objects created in loops

    # labels

    x_latex = 0.16
    y_latex_top = 0.83
    y_step = 0.055

    title_x = v_varshape_latex
    title_y = "(1/#it{N}_{jet}) d#it{N}/d%s" % v_varshape_latex
    title_full = ";%s;%s" % (title_x, title_y)
    title_full_ratio = ";%s;data/MC: ratio of %s" % (title_x, title_y)

    text_alice = "#bf{ALICE} Preliminary, pp, #sqrt{#it{s}} = 13 TeV"
    text_alice_sim = "#bf{ALICE} Simulation, pp, #sqrt{#it{s}} = 13 TeV"
    text_pythia = "PYTHIA 8 (Monash)"
    text_pythia_split = "#splitline{PYTHIA 8}{(Monash)}"
    text_jets = "charged jets, anti-#it{k}_{T}, #it{R} = 0.4"
    text_ptjet = "%g #leq %s < %g GeV/#it{c}, #left|#it{#eta}_{jet}#right| #leq 0.5" % (
        lvar2_binmin_reco[ibin2], p_latexbin2var, lvar2_binmax_reco[ibin2])
    text_pth = "%g #leq #it{p}_{T}^{%s} < %g GeV/#it{c}, #left|#it{y}_{%s}#right| #leq 0.8" % (
        lpt_finbinmin[0], p_latexnhadron,
        min(lpt_finbinmax[-1], lvar2_binmax_reco[ibin2]), p_latexnhadron)
    text_ptcut = "#it{p}_{T, incl. ch. jet}^{leading track} #geq 5.33 GeV/#it{c}"
    text_ptcut_sim = "#it{p}_{T, incl. ch. jet}^{leading h^{#pm}} #geq 5.33 GeV/#it{c} (varied)"
    text_sd = "Soft Drop (#it{z}_{cut} = 0.1, #it{#beta} = 0)"

    title_thetag = "#it{#theta}_{g} = #it{R}_{g}/#it{R}"
    radius_jet = 0.4

    # colour and marker indices
    c_hf_data = 0
    c_incl_data = 1
    c_hf_mc = 2
    c_incl_mc = 6
    c_quark_mc = 5
    c_gluon_mc = 0

    # markers
    m_hf_data = get_marker(0)
    m_incl_data = get_marker(1)
    m_hf_mc = get_marker(0, 2)
    m_incl_mc = get_marker(1, 2)
    m_quark_mc = get_marker(2)
    m_gluon_mc = get_marker(3)

    # make the horizontal error bars smaller
    if shape == "nsd":
        for gr in [
                hf_data_syst, incl_data_syst, hf_ratio_syst, incl_ratio_syst,
                incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst
        ]:
            for i in range(gr.GetN()):
                gr.SetPointEXlow(i, 0.1)
                gr.SetPointEXhigh(i, 0.1)

    # data, HF and inclusive

    hf_data_syst_cl = hf_data_syst.Clone()

    leg_pos = [.72, .75, .85, .85]
    list_obj = [hf_data_syst, incl_data_syst, hf_data_stat, incl_data_stat]
    labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive", "", ""]
    colours = [
        get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data,
                                          c_incl_data), (2, 2, 1, 1))
    ]
    markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_data, list_obj_data_new = make_plot("cshape_data_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip((hf_data_syst, incl_data_syst), (c_hf_data, c_incl_data)):
        gr.SetMarkerColor(get_colour(c))
    list_obj_data_new[0].SetTextSize(fontsize)
    if shape == "nsd":
        hf_data_syst.GetXaxis().SetNdivisions(5)
    # Draw a line through the points.
    if shape == "nsd":
        for h in (hf_data_stat, incl_data_stat):
            h_line = h.Clone(h.GetName() + "_line")
            h_line.SetLineStyle(2)
            h_line.Draw("l hist same")
            list_new.append(h_line)
    cshape_data.Update()
    if shape == "rg":
        # plot the theta_g axis
        gr_frame = hf_data_syst
        axis_rg = gr_frame.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data = []
    for text_latex in [
            text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data.Update()
    cshape_data.SaveAs("%s/%s_data_%s.pdf" % (rootpath, shape, suffix))

    # data and PYTHIA, HF

    leg_pos = [.72, .65, .85, .85]
    list_obj = [hf_data_syst_cl, hf_data_stat, hf_pythia_stat]
    labels_obj = ["data", "", text_pythia_split]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_hf_data, c_hf_data, c_hf_mc), (2, 1, 1))
    ]
    markers = [m_hf_data, m_hf_data, m_hf_mc]
    y_margin_up = 0.4
    y_margin_down = 0.05
    cshape_data_mc_hf, list_obj_data_mc_hf_new = make_plot("cshape_data_mc_hf_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip([hf_data_syst_cl], [c_hf_data]):
        gr.SetMarkerColor(get_colour(c))
    leg_data_mc_hf = list_obj_data_mc_hf_new[0]
    leg_data_mc_hf.SetHeader("%s-tagged" % p_latexnhadron)
    leg_data_mc_hf.SetTextSize(fontsize)
    if shape == "nsd":
        hf_data_syst_cl.GetXaxis().SetNdivisions(5)
        #axis_nsd = hf_data_syst_cl.GetHistogram().GetXaxis()
        #x1 = axis_nsd.GetBinLowEdge(1)
        #x2 = axis_nsd.GetBinUpEdge(axis_nsd.GetNbins())
        #axis_nsd.Set(5, x1, x2)
        #for ibin in range(axis_nsd.GetNbins()):
        #    axis_nsd.SetBinLabel(ibin + 1, "%d" % ibin)
        #axis_nsd.SetNdivisions(5)
    cshape_data_mc_hf.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_data_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data_mc_hf.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data_mc_hf.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data_mc_hf = []
    for text_latex in [text_alice, text_jets, text_ptjet, text_pth, text_sd]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data_mc_hf.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data_mc_hf.Update()
    cshape_data_mc_hf.SaveAs("%s/%s_data_mc_hf_%s.pdf" %
                             (rootpath, shape, suffix))

    # data and PYTHIA, inclusive

    #leg_pos = [.68, .65, .85, .85]
    list_obj = [
        incl_data_syst, incl_pythia_syst, incl_data_stat, incl_pythia_stat
    ]
    labels_obj = ["data", text_pythia_split]
    colours = [
        get_colour(i, j) for i, j in zip((c_incl_data, c_incl_mc, c_incl_data,
                                          c_incl_mc), (2, 2, 1, 1))
    ]
    markers = [m_incl_data, m_incl_mc, m_incl_data, m_incl_mc]
    y_margin_up = 0.4
    y_margin_down = 0.05
    cshape_data_mc_incl, list_obj_data_mc_incl_new = make_plot("cshape_data_mc_incl_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip([incl_data_syst, incl_pythia_syst],
                     [c_incl_data, c_incl_mc]):
        gr.SetMarkerColor(get_colour(c))
    leg_data_mc_incl = list_obj_data_mc_incl_new[0]
    leg_data_mc_incl.SetHeader("inclusive")
    leg_data_mc_incl.SetTextSize(fontsize)
    if shape == "nsd":
        incl_data_syst.GetXaxis().SetNdivisions(5)
    cshape_data_mc_incl.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = incl_data_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_data_mc_incl.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_data_mc_incl.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_data_mc_incl = []
    for text_latex in [text_alice, text_jets, text_ptjet, text_ptcut, text_sd]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_data_mc_incl.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_data_mc_incl.Update()
    cshape_data_mc_incl.SaveAs("%s/%s_data_mc_incl_%s.pdf" %
                               (rootpath, shape, suffix))

    # Ratios data/MC, HF and inclusive

    line_1 = TLine(lvarshape_binmin_reco[0], 1, lvarshape_binmax_reco[-1], 1)
    line_1.SetLineStyle(9)
    line_1.SetLineColor(1)
    line_1.SetLineWidth(3)

    #leg_pos = [.72, .7, .85, .85] # with header
    leg_pos = [.72, .75, .85, .85]  # without header
    list_obj = [
        hf_ratio_syst, line_1, incl_ratio_syst, hf_ratio_stat, incl_ratio_stat
    ]
    labels_obj = ["%s-tagged" % p_latexnhadron, "inclusive"]
    colours = [
        get_colour(i, j) for i, j in zip((c_hf_data, c_incl_data, c_hf_data,
                                          c_incl_data), (2, 2, 1, 1))
    ]
    markers = [m_hf_data, m_incl_data, m_hf_data, m_incl_data]
    y_margin_up = 0.52
    y_margin_down = 0.05
    if shape == "nsd":
        y_margin_up = 0.22
    cshape_ratio, list_obj_ratio_new = make_plot("cshape_ratio_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full_ratio)
    cshape_ratio.Update()
    for gr, c in zip((hf_ratio_syst, incl_ratio_syst),
                     (c_hf_data, c_incl_data)):
        gr.SetMarkerColor(get_colour(c))
    leg_ratio = list_obj_ratio_new[0]
    leg_ratio.SetTextSize(fontsize)
    #leg_ratio.SetHeader("data/MC")
    if shape == "nsd":
        hf_ratio_syst.GetXaxis().SetNdivisions(5)
    cshape_ratio.Update()
    if shape == "rg":
        # plot the theta_g axis
        gr_frame = hf_ratio_syst
        axis_rg = gr_frame.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_ratio.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_ratio.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_ratio = []
    for text_latex in [
            text_alice, text_jets, text_ptjet, text_pth, text_ptcut, text_sd,
            text_pythia
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_ratio.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_ratio.Update()
    cshape_ratio.SaveAs("%s/%s_ratio_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, inclusive, quark, gluon

    incl_pythia_syst_cl = incl_pythia_syst.Clone()

    y_min_h, y_max_h = get_y_window_his([
        hf_pythia_stat, incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat
    ])
    y_min_g, y_max_g = get_y_window_gr(
        [incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst])
    y_min = min(y_min_h, y_min_g)
    y_max = max(y_max_h, y_max_g)
    y_margin_up = 0.46
    y_margin_down = 0.05
    y_min_plot, y_max_plot = get_plot_range(y_min, y_max, y_margin_down,
                                            y_margin_up)

    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .55, .85, .85]
    list_obj = [
        incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat,
        incl_pythia_stat, quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["inclusive", "quark", "gluon", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_mc, c_quark_mc, c_gluon_mc, c_hf_mc, c_incl_mc,
                         c_quark_mc, c_gluon_mc), (2, 2, 2, 1, 1, 1, 1))
    ]
    markers = [
        m_incl_mc, m_quark_mc, m_gluon_mc, m_hf_mc, m_incl_mc, m_quark_mc,
        m_gluon_mc
    ]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)
    cshape_mc.Update()
    for gr, c in zip((incl_pythia_syst, quark_pythia_syst, gluon_pythia_syst),
                     (c_incl_mc, c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        incl_pythia_syst.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, quark, gluon

    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .61, .85, .85]
    list_obj = [
        quark_pythia_syst, gluon_pythia_syst, hf_pythia_stat,
        quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["quark", "gluon", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_quark_mc, c_gluon_mc, c_hf_mc, c_quark_mc,
                         c_gluon_mc), (2, 2, 1, 1, 1))
    ]
    markers = [m_quark_mc, m_gluon_mc, m_hf_mc, m_quark_mc, m_gluon_mc]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_qgd_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)
    cshape_mc.Update()
    for gr, c in zip((quark_pythia_syst, gluon_pythia_syst),
                     (c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        quark_pythia_syst.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_qgd_%s.pdf" % (rootpath, shape, suffix))

    # PYTHIA, HF, inclusive

    #leg_pos = [.6, .65, .75, .85]
    leg_pos = [.72, .67, .85, .85]
    list_obj = [incl_pythia_syst_cl, incl_pythia_stat, hf_pythia_stat]
    labels_obj = ["inclusive", "", "%s-tagged" % p_latexnhadron]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_mc, c_incl_mc, c_hf_mc), (2, 1, 1))
    ]
    markers = [m_incl_mc, m_incl_mc, m_hf_mc]
    y_margin_up = 0.46
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_id_" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, range_y=[y_min_plot, y_max_plot], margins_c=margins_can, \
        title=title_full)
    # Draw a dashed line through the points.
    if shape == "nsd":
        for h in (incl_pythia_stat, hf_pythia_stat):
            h_line = h.Clone(h.GetName() + "_line")
            h_line.SetLineStyle(2)
            h_line.Draw("l hist same")
            list_obj_mc_new.append(h_line)  # keep the clone alive on the canvas
    cshape_mc.Update()
    incl_pythia_syst_cl.SetMarkerColor(get_colour(c_incl_mc))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    leg_mc.SetHeader(text_pythia_split)
    if shape == "nsd":
        incl_pythia_syst_cl.GetXaxis().SetNdivisions(5)
    cshape_mc.Update()
    if shape == "rg":
        # plot the theta_g axis
        axis_rg = hf_pythia_stat.GetXaxis()
        rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
        rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
        thetag_min = rg_min / radius_jet
        thetag_max = rg_max / radius_jet
        y_axis = cshape_mc.GetUymax()
        axis_thetag = TGaxis(rg_min, y_axis, rg_max, y_axis, thetag_min,
                             thetag_max, 510, "-")
        axis_thetag.SetTitle(title_thetag)
        axis_thetag.SetTitleSize(size_thg)
        axis_thetag.SetLabelSize(0.036)
        axis_thetag.SetTitleFont(42)
        axis_thetag.SetLabelFont(42)
        axis_thetag.SetLabelOffset(0)
        axis_thetag.SetTitleOffset(offset_thg)
        cshape_mc.SetTickx(0)
        axis_thetag.Draw("same")
    # Draw LaTeX
    y_latex = y_latex_top
    list_latex_mc = []
    for text_latex in [
            text_alice_sim, text_jets, text_ptjet, text_pth, text_ptcut_sim,
            text_sd
    ]:
        latex = TLatex(x_latex, y_latex, text_latex)
        list_latex_mc.append(latex)
        draw_latex(latex, textsize=fontsize)
        y_latex -= y_step
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_mc_id_%s.pdf" % (rootpath, shape, suffix))

    # data inclusive vs PYTHIA, quark, gluon

    #leg_pos = [.6, .65, .75, .85]
    #leg_pos = [.72, .55, .85, .85]
    leg_pos = [.6, .7, .85, .85]
    list_obj = [
        incl_data_syst, quark_pythia_syst, gluon_pythia_syst, incl_data_stat,
        quark_pythia_stat, gluon_pythia_stat
    ]
    labels_obj = ["inclusive (data)", "quark (PYTHIA 8)", "gluon (PYTHIA 8)"]
    colours = [
        get_colour(i, j)
        for i, j in zip((c_incl_data, c_quark_mc, c_gluon_mc, c_incl_data,
                         c_quark_mc, c_gluon_mc), (2, 2, 2, 1, 1, 1))
    ]
    markers = [
        m_incl_data, m_quark_mc, m_gluon_mc, m_incl_data, m_quark_mc,
        m_gluon_mc
    ]
    y_margin_up = 0.3
    y_margin_down = 0.05
    cshape_mc, list_obj_mc_new = make_plot("cshape_mc_data_iqg" + suffix, size=size_can, \
        list_obj=list_obj, labels_obj=labels_obj, opt_leg_g=opt_leg_g, opt_plot_g=opt_plot_g, offsets_xy=offsets_axes, \
        colours=colours, markers=markers, leg_pos=leg_pos, margins_y=[y_margin_down, y_margin_up], margins_c=margins_can, \
        title=title_full)
    for gr, c in zip((incl_data_syst, quark_pythia_syst, gluon_pythia_syst),
                     (c_incl_data, c_quark_mc, c_gluon_mc)):
        gr.SetMarkerColor(get_colour(c))
    leg_mc = list_obj_mc_new[0]
    leg_mc.SetTextSize(fontsize)
    cshape_mc.Update()
    cshape_mc.SaveAs("%s/%s_data_i_mc_qg_%s.pdf" % (rootpath, shape, suffix))
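
The function above repeats one recipe for every canvas: draw everything with make_plot and, for the "rg" shape, overlay a secondary theta_g axis along the top of the r_g axis using theta_g = r_g / radius_jet. A minimal sketch of a helper that could factor out that repeated TGaxis block (argument names follow the surrounding code; this is a sketch, not part of the original source):

from ROOT import TGaxis  # pylint: disable=import-error, no-name-in-module

def overlay_thetag_axis(canvas, frame, radius_jet, title_thetag, size_thg, offset_thg):
    """Overlay a theta_g = r_g / R axis along the top edge of an r_g frame (sketch)."""
    axis_rg = frame.GetXaxis()
    rg_min = axis_rg.GetBinLowEdge(axis_rg.GetFirst())
    rg_max = axis_rg.GetBinUpEdge(axis_rg.GetLast())
    # Same TGaxis arguments as in the blocks above: 510 divisions, labels on top ("-")
    axis_thetag = TGaxis(rg_min, canvas.GetUymax(), rg_max, canvas.GetUymax(),
                         rg_min / radius_jet, rg_max / radius_jet, 510, "-")
    axis_thetag.SetTitle(title_thetag)
    axis_thetag.SetTitleSize(size_thg)
    axis_thetag.SetLabelSize(0.036)
    axis_thetag.SetTitleFont(42)
    axis_thetag.SetLabelFont(42)
    axis_thetag.SetLabelOffset(0)
    axis_thetag.SetTitleOffset(offset_thg)
    canvas.SetTickx(0)  # drop mirrored x ticks that would collide with the new axis
    axis_thetag.Draw("same")
    return axis_thetag  # the caller must keep this reference alive
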
Example No. 21
    def reset_input(self, dataframe, tag):
        self.source_dataframe = dataframe
        self.collection_tag = tag
        if self.verbose:
            get_logger().info("Resetting ValidationCollection with tag '%s'",
                              self.collection_tag)
Example No. 22
def do_entire_analysis(data_config: dict, data_param: dict, data_param_overwrite: dict, # pylint: disable=too-many-locals, too-many-statements, too-many-branches
                       data_model: dict, run_param: dict, clean: bool):

    # Disable any graphical stuff. No TCanvases opened and shown by default
    gROOT.SetBatch(True)

    logger = get_logger()
    logger.info("Do analysis chain")

    # If we are here we are interested in the very first key in the parameters database
    case = list(data_param.keys())[0]

    # Update database accordingly if needed
    update_config(data_param, data_config, data_param_overwrite)

    dodownloadalice = data_config["download"]["alice"]["activate"]
    doconversionmc = data_config["conversion"]["mc"]["activate"]
    doconversiondata = data_config["conversion"]["data"]["activate"]
    domergingmc = data_config["merging"]["mc"]["activate"]
    domergingdata = data_config["merging"]["data"]["activate"]
    doskimmingmc = data_config["skimming"]["mc"]["activate"]
    doskimmingdata = data_config["skimming"]["data"]["activate"]
    domergingperiodsmc = data_config["mergingperiods"]["mc"]["activate"]
    domergingperiodsdata = data_config["mergingperiods"]["data"]["activate"]
    doml = data_config["ml_study"]["activate"]
    docorrelation = data_config["ml_study"]['docorrelation']
    dotraining = data_config["ml_study"]['dotraining']
    dotesting = data_config["ml_study"]['dotesting']
    doapplytodatamc = data_config["ml_study"]['doapplytodatamc']
    docrossvalidation = data_config["ml_study"]['docrossvalidation']
    dolearningcurve = data_config["ml_study"]['dolearningcurve']
    doroc = data_config["ml_study"]['doroc']
    doroctraintest = data_config["ml_study"]['doroctraintest']
    doboundary = data_config["ml_study"]['doboundary']
    doimportance = data_config["ml_study"]['doimportance']
    doimportanceshap = data_config["ml_study"]['doimportanceshap']
    dogridsearch = data_config["ml_study"]['dogridsearch']
    dobayesianopt = data_config["ml_study"]['dobayesianopt']
    doefficiencyml = data_config["ml_study"]['doefficiency']
    dosignifopt = data_config["ml_study"]['dosignifopt']
    doscancuts = data_config["ml_study"]["doscancuts"]
    doplotdistr = data_config["ml_study"]["doplotdistr"]
    doapplydata = data_config["mlapplication"]["data"]["doapply"]
    doapplymc = data_config["mlapplication"]["mc"]["doapply"]
    domergeapplydata = data_config["mlapplication"]["data"]["domergeapply"]
    domergeapplymc = data_config["mlapplication"]["mc"]["domergeapply"]
    docontinueapplydata = data_config["mlapplication"]["data"]["docontinueafterstop"]
    docontinueapplymc = data_config["mlapplication"]["mc"]["docontinueafterstop"]
    dohistomassmc = data_config["analysis"]["mc"]["histomass"]
    dohistomassdata = data_config["analysis"]["data"]["histomass"]
    doefficiency = data_config["analysis"]["mc"]["efficiency"]
    doresponse = data_config["analysis"]["mc"]["response"]
    dofeeddown = data_config["analysis"]["mc"]["feeddown"]
    dounfolding = data_config["analysis"]["mc"]["dounfolding"]
    dojetsystematics = data_config["analysis"]["data"]["dojetsystematics"]
    dofit = data_config["analysis"]["dofit"]
    doeff = data_config["analysis"]["doeff"]
    docross = data_config["analysis"]["docross"]
    doplotsval = data_config["analysis"]["doplotsval"]
    doplots = data_config["analysis"]["doplots"]
    dosyst = data_config["analysis"]["dosyst"]
    dosystprob = data_config["systematics"]["cutvar"]["activate"]
    do_syst_prob_mass = data_config["systematics"]["cutvar"]["probvariationmass"]
    do_syst_prob_eff = data_config["systematics"]["cutvar"]["probvariationeff"]
    do_syst_prob_fit = data_config["systematics"]["cutvar"]["probvariationfit"]
    do_syst_prob_cross = data_config["systematics"]["cutvar"]["probvariationcross"]
    dosystptshape = data_config["systematics"]["mcptshape"]["activate"]
    doanaperperiod = data_config["analysis"]["doperperiod"]
    typean = data_config["analysis"]["type"]

    dojetstudies = data_config["analysis"]["dojetstudies"]

    dirpklmc = data_param[case]["multi"]["mc"]["pkl"]
    dirpklevtcounter_allmc = data_param[case]["multi"]["mc"]["pkl_evtcounter_all"]
    dirpklskmc = data_param[case]["multi"]["mc"]["pkl_skimmed"]
    dirpklmlmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotmc = data_param[case]["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
    dirpkldata = data_param[case]["multi"]["data"]["pkl"]
    dirpklevtcounter_alldata = data_param[case]["multi"]["data"]["pkl_evtcounter_all"]
    dirpklskdata = data_param[case]["multi"]["data"]["pkl_skimmed"]
    dirpklmldata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml"]
    dirpklmltotdata = data_param[case]["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
    dirpklskdecmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_dec"]
    dirpklskdec_mergedmc = data_param[case]["mlapplication"]["mc"]["pkl_skimmed_decmerged"]
    dirpklskdecdata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_dec"]
    dirpklskdec_mergeddata = data_param[case]["mlapplication"]["data"]["pkl_skimmed_decmerged"]

    dirresultsdata = data_param[case]["analysis"][typean]["data"]["results"]
    dirresultsmc = data_param[case]["analysis"][typean]["mc"]["results"]
    dirresultsdatatot = data_param[case]["analysis"][typean]["data"]["resultsallp"]
    dirresultsmctot = data_param[case]["analysis"][typean]["mc"]["resultsallp"]

    binminarray = data_param[case]["ml"]["binmin"]
    binmaxarray = data_param[case]["ml"]["binmax"]
    raahp = data_param[case]["ml"]["opt"]["raahp"]
    mltype = data_param[case]["ml"]["mltype"]
    training_vars = data_param[case]["variables"]["var_training"]

    mlout = data_param[case]["ml"]["mlout"]
    mlplot = data_param[case]["ml"]["mlplot"]

    proc_type = data_param[case]["analysis"][typean]["proc_type"]

    # Check that the output directories do not exist yet (they are created below).
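    # checkdir/checkdirlist are assumed to return a negative count for paths
    # that already exist; any negative total below aborts via sys.exit().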
    counter = 0
    if doconversionmc is True:
        counter = counter + checkdirlist(dirpklmc)

    if doconversiondata is True:
        counter = counter + checkdirlist(dirpkldata)

    if doskimmingmc is True:
        counter = counter + checkdirlist(dirpklskmc)
        counter = counter + checkdir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        counter = counter + checkdirlist(dirpklskdata)
        counter = counter + checkdir(dirpklevtcounter_alldata)

    if domergingmc is True:
        counter = counter + checkdirlist(dirpklmlmc)

    if domergingdata is True:
        counter = counter + checkdirlist(dirpklmldata)

    if domergingperiodsmc is True:
        counter = counter + checkdir(dirpklmltotmc)

    if domergingperiodsdata is True:
        counter = counter + checkdir(dirpklmltotdata)

    if doml is True:
        counter = counter + checkdir(mlout)
        counter = counter + checkdir(mlplot)

    if docontinueapplymc is False:
        if doapplymc is True:
            counter = counter + checkdirlist(dirpklskdecmc)

        if domergeapplymc is True:
            counter = counter + checkdirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            counter = counter + checkdirlist(dirpklskdecdata)

        if domergeapplydata is True:
            counter = counter + checkdirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        counter = counter + checkdirlist(dirresultsmc)
        counter = counter + checkdir(dirresultsmctot)

    if dohistomassdata is True:
        counter = counter + checkdirlist(dirresultsdata)
        counter = counter + checkdir(dirresultsdatatot)

    if counter < 0:
        sys.exit()
    # check and create directories

    if doconversionmc is True:
        checkmakedirlist(dirpklmc)

    if doconversiondata is True:
        checkmakedirlist(dirpkldata)

    if doskimmingmc is True:
        checkmakedirlist(dirpklskmc)
        checkmakedir(dirpklevtcounter_allmc)

    if doskimmingdata is True:
        checkmakedirlist(dirpklskdata)
        checkmakedir(dirpklevtcounter_alldata)

    if domergingmc is True:
        checkmakedirlist(dirpklmlmc)

    if domergingdata is True:
        checkmakedirlist(dirpklmldata)

    if domergingperiodsmc is True:
        checkmakedir(dirpklmltotmc)

    if domergingperiodsdata is True:
        checkmakedir(dirpklmltotdata)

    if doml is True:
        checkmakedir(mlout)
        checkmakedir(mlplot)

    if docontinueapplymc is False:
        if doapplymc is True:
            checkmakedirlist(dirpklskdecmc)

        if domergeapplymc is True:
            checkmakedirlist(dirpklskdec_mergedmc)

    if docontinueapplydata is False:
        if doapplydata is True:
            checkmakedirlist(dirpklskdecdata)

        if domergeapplydata is True:
            checkmakedirlist(dirpklskdec_mergeddata)

    if dohistomassmc is True:
        checkmakedirlist(dirresultsmc)
        checkmakedir(dirresultsmctot)

    if dohistomassdata is True:
        checkmakedirlist(dirresultsdata)
        checkmakedir(dirresultsdatatot)

    proc_class = Processer
    ana_class = Analyzer
    syst_class = Systematics
    if proc_type == "Dhadrons":
        print("Using new feature for Dhadrons")
        proc_class = ProcesserDhadrons
        ana_class = AnalyzerDhadrons
    if proc_type == "Dhadrons_mult":
        print("Using new feature for Dhadrons_mult")
        proc_class = ProcesserDhadrons_mult
        ana_class = AnalyzerDhadrons_mult
    if proc_type == "Dhadrons_jet":
        print("Using new feature for Dhadrons_jet")
        proc_class = ProcesserDhadrons_jet
        ana_class = AnalyzerJet

    mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc")
    mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param,\
                                        "data")
    ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod)
    # Has to be done always period-by-period
    syst_mgr = AnalyzerManager(syst_class, data_param[case], case, typean, True, run_param)

    #perform the analysis flow
    if dodownloadalice == 1:
        subprocess.call("../cplusutilities/Download.sh")

    if doconversionmc == 1:
        mymultiprocessmc.multi_unpack_allperiods()

    if doconversiondata == 1:
        mymultiprocessdata.multi_unpack_allperiods()

    if doskimmingmc == 1:
        mymultiprocessmc.multi_skim_allperiods()

    if doskimmingdata == 1:
        mymultiprocessdata.multi_skim_allperiods()

    if domergingmc == 1:
        mymultiprocessmc.multi_mergeml_allperiods()

    if domergingdata == 1:
        mymultiprocessdata.multi_mergeml_allperiods()

    if domergingperiodsmc == 1:
        mymultiprocessmc.multi_mergeml_allinone()

    if domergingperiodsdata == 1:
        mymultiprocessdata.multi_mergeml_allinone()

    if doml is True:
        index = 0
        for binmin, binmax in zip(binminarray, binmaxarray):
            myopt = Optimiser(data_param[case], case, typean,
                              data_model[mltype], binmin, binmax,
                              raahp[index], training_vars[index])
            if docorrelation is True:
                myopt.do_corr()
            if dotraining is True:
                myopt.do_train()
            if dotesting is True:
                myopt.do_test()
            if doapplytodatamc is True:
                myopt.do_apply()
            if docrossvalidation is True:
                myopt.do_crossval()
            if dolearningcurve is True:
                myopt.do_learningcurve()
            if doroc is True:
                myopt.do_roc()
            if doroctraintest is True:
                myopt.do_roc_train_test()
            if doplotdistr is True:
                myopt.do_plot_model_pred()
            if doimportance is True:
                myopt.do_importance()
            if doimportanceshap is True:
                myopt.do_importance_shap()
            if dogridsearch is True:
                myopt.do_grid()
            if dobayesianopt is True:
                myopt.do_bayesian_opt()
            if doboundary is True:
                myopt.do_boundary()
            if doefficiencyml is True:
                myopt.do_efficiency()
            if dosignifopt is True:
                myopt.do_significance()
            if doscancuts is True:
                myopt.do_scancuts()
            index = index + 1

    if doapplydata is True:
        mymultiprocessdata.multi_apply_allperiods()
    if doapplymc is True:
        mymultiprocessmc.multi_apply_allperiods()
    if domergeapplydata is True:
        mymultiprocessdata.multi_mergeapply_allperiods()
    if domergeapplymc is True:
        mymultiprocessmc.multi_mergeapply_allperiods()
    if dohistomassmc is True:
        mymultiprocessmc.multi_histomass()
    if dohistomassdata is True:
        # After-burner in case of a mult analysis to obtain "correctionsweight.root"
        # for merged-period data
        # pylint: disable=fixme
        # FIXME Can only be run here because result directories are constructed when histomass
        #       is run. If this step was independent, histomass would always complain that the
        #       result directory already exists.
        mymultiprocessdata.multi_histomass()
    if doefficiency is True:
        mymultiprocessmc.multi_efficiency()
    if doresponse is True:
        mymultiprocessmc.multi_response()

    # Collect all desired analysis steps
    analyze_steps = []
    if dofit is True:
        analyze_steps.append("fit")
    if dosyst is True:
        analyze_steps.append("yield_syst")
    if doeff is True:
        analyze_steps.append("efficiency")
    if dojetstudies is True:
        if dofit is False:
            analyze_steps.append("fit")
        if doeff is False:
            analyze_steps.append("efficiency")
        analyze_steps.append("sideband_sub")
    if dofeeddown is True:
        analyze_steps.append("feeddown")
    if dounfolding is True:
        analyze_steps.append("unfolding")
        analyze_steps.append("unfolding_closure")
    if dojetsystematics is True:
        analyze_steps.append("jetsystematics")
    if docross is True:
        analyze_steps.append("makenormyields")
    if doplots is True:
        analyze_steps.append("plotternormyields")
    if doplotsval is True:
        analyze_steps.append("plottervalidation")

    # Now do the analysis
    ana_mgr.analyze(*analyze_steps)

    ml_syst_steps = []
    if dosystprob is True:
        if do_syst_prob_mass:
            ml_syst_steps.append("ml_cutvar_mass")
        if do_syst_prob_eff:
            ml_syst_steps.append("ml_cutvar_eff")
        if do_syst_prob_fit:
            ml_syst_steps.append("ml_cutvar_fit")
        if do_syst_prob_cross:
            ml_syst_steps.append("ml_cutvar_cross")
    if dosystptshape is True:
        ml_syst_steps.append("mcptshape")
    syst_mgr.analyze(*ml_syst_steps)

    # Delete per-period results.
    if clean:
        print("Cleaning")
        if doanaperperiod:
            print("Per-period analysis enabled. Skipping.")
        else:
            if not delete_dirlist(dirresultsmc + dirresultsdata):
                print("Error: Failed to complete cleaning.")

    print("Done")
Example No. 23
def efficiency_cutscan(
        dataframe_,
        mylistvariables_,
        modelname_,
        threshold,  # pylint: disable=too-many-statements
        output_,
        suffix_,
        plot_options_=None):

    plot_type_name = "eff_cut_scan"
    plot_options = {}
    if isinstance(plot_options_, dict):
        plot_options = plot_options_.get(plot_type_name, {})
    selml = "y_test_prob%s>%s" % (modelname_, threshold)
    dataframe_ = dataframe_.query(selml)

    fig = plt.figure(figsize=(60, 25))
    gs = GridSpec(3, int(len(mylistvariables_) / 3 + 1))
    axes = [fig.add_subplot(gs[i]) for i in range(len(mylistvariables_))]

    # Available cut options
    cut_options = ["lt", "st", "abslt", "absst"]

    for i, var_tuple in enumerate(mylistvariables_):
        var = var_tuple[0]
        vardir = var_tuple[1]

        axes[i].set_xlabel(var, fontsize=30)
        axes[i].set_ylabel("entries (normalised)", fontsize=30)
        axes[i].tick_params(labelsize=20)
        axes[i].set_yscale('log')
        axes[i].set_ylim(0.1, 1.5)
        values = dataframe_[var].values

        if "abs" in vardir:
            cen = var_tuple[2] if len(var_tuple) > 2 else None
            if cen is None:
                get_logger().error("Absolute cut chosen for %s. " \
                        "However, no central value provided", var)
                continue
            values = np.array([abs(v - cen) for v in values])

        nbinscan = 100
        if var in plot_options and "xlim" in plot_options[var]:
            minv, maxv = plot_options[var]["xlim"]
        else:
            minv, maxv = values.min(), values.max()
        _, bina = np.histogram(values, range=(minv, maxv), bins=nbinscan)
        widthbin = (maxv - minv) / float(nbinscan)
        width = np.diff(bina)
        center = (bina[:-1] + bina[1:]) / 2
        den = len(values)
        ratios = deque()

        if vardir not in cut_options:
            get_logger().error("Please choose cut option from %s. " \
                    "Your current setting for variable %s is %s", str(cut_options), vardir, var)
            continue

        if "lt" in vardir:
            for ibin in range(nbinscan):
                values = values[values > minv + widthbin * ibin]
                num = len(values)
                eff = float(num) / float(den)
                ratios.append(eff)
        else:
            for ibin in range(nbinscan, 0, -1):
                values = values[values < minv + widthbin * ibin]
                num = len(values)
                eff = float(num) / float(den)
                ratios.appendleft(eff)
        lbl = f'prob > {threshold}'
        axes[i].bar(center, ratios, align='center', width=width, label=lbl)
        axes[i].legend(fontsize=30)
    plotname = join(output_,
                    f"variables_effscan_prob{threshold}_{suffix_}.png")
    plt.savefig(plotname, bbox_inches='tight')
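
Each entry of mylistvariables_ is a (variable, cut_option) tuple, with cut_option one of "lt", "st", "abslt" or "absst"; the absolute variants need a third element giving the central value. A hypothetical call (variable and path names are illustrative, not from the original configuration):

# Hypothetical usage; df must carry the scan variables and a "y_test_probmymodel" column.
my_variables = [
    ("d_len", "lt"),            # keep candidates above the scanned threshold
    ("cos_p", "st"),            # keep candidates below the scanned threshold
    ("imp_par", "abslt", 0.0),  # scan on |imp_par - 0.0|; central value required
]
efficiency_cutscan(df, my_variables, "mymodel", 0.5, "/tmp/plots", "pt_2_4")
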
Example No. 24
    def write(self):
        for i in self.histograms:
            if self.verbose:
                get_logger().info("Writing histogram %s", i.GetName())
            i.Write()
Example No. 25
    def __init__(self, database: dict, ana_type: str, file_data_name: str,
                 file_mc_name: str):
        """
        Initialize MLFitParsFactory
        Args:
            database: dictionary of the entire analysis database
            ana_type: specifying the analysis within the database to be done
            file_data_name: file path where to find data histograms to fit
            file_mc_name: file path where to find MC histograms to fit
        """

        self.logger = get_logger()

        ana_config = database["analysis"][ana_type]

        self.prob_cut_fin = database["mlapplication"]["probcutoptimal"]

        # File config
        self.file_data_name = file_data_name
        self.file_mc_name = file_mc_name

        # Binning
        self.bin1_name = database["var_binning"]
        self.bins1_edges_low = ana_config["sel_an_binmin"]
        self.bins1_edges_up = ana_config["sel_an_binmax"]
        self.n_bins1 = len(self.bins1_edges_low)
        self.bin2_name = ana_config["var_binning2"]
        self.bin2_gen_name = ana_config["var_binning2_gen"]
        self.bins2_edges_low = ana_config["sel_binmin2"]
        self.bins2_edges_up = ana_config["sel_binmax2"]
        self.n_bins2 = len(self.bins2_edges_low)
        self.bin_matching = ana_config["binning_matching"]

        bineff = ana_config["usesinglebineff"]
        self.bins2_int_bin = bineff if bineff is not None else 0
        # Fit method flags
        self.init_fits_from = ana_config["init_fits_from"]
        self.sig_func_name = ana_config["sgnfunc"]
        self.bkg_func_name = ana_config["bkgfunc"]
        self.fit_range_low = ana_config["massmin"]
        self.fit_range_up = ana_config["massmax"]
        self.likelihood = ana_config["dolikelihood"]
        self.rebin = ana_config["rebin"]
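        # "rebin" may be a single flat list (one value per analysis bin); if so,
        # replicate it so there is one such list per second-binning interval.
        # The same broadcast pattern is applied to the second-peak flags below.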
        try:
            iter(self.rebin[0])
        except TypeError:
            self.rebin = [self.rebin for _ in range(self.n_bins2)]

        # Initial fit parameters
        self.mean = ana_config["masspeak"]
        self.fix_mean = ana_config["FixedMean"]
        self.use_user_mean = ana_config["SetInitialGaussianMean"]
        self.sigma = ana_config["sigmaarray"]
        self.fix_sigma = ana_config["SetFixGaussianSigma"]
        self.use_user_sigma = ana_config["SetInitialGaussianSigma"]
        self.max_rel_sigma_diff = ana_config["MaxPercSigmaDeviation"]
        self.n_sigma_sideband = ana_config["exclude_nsigma_sideband"]
        self.n_sigma_signal = ana_config["nsigma_signal"]
        self.rel_sigma_bound = ana_config["MaxPercSigmaDeviation"]

        # Second peak flags
        self.include_sec_peak = ana_config.get("includesecpeak",
                                               [False] * self.n_bins1)
        try:
            iter(self.include_sec_peak[0])
        except TypeError:
            self.include_sec_peak = [
                self.include_sec_peak for _ in range(self.n_bins2)
            ]

        self.sec_mean = ana_config[
            "masssecpeak"] if self.include_sec_peak else None
        self.fix_sec_mean = ana_config.get("fix_masssecpeak",
                                           [False] * self.n_bins1)
        try:
            iter(self.fix_sec_mean[0])
        except TypeError:
            self.fix_sec_mean = [
                self.fix_sec_mean for _ in range(self.n_bins2)
            ]
        self.sec_sigma = ana_config[
            "widthsecpeak"] if self.include_sec_peak else None
        self.fix_sec_sigma = ana_config[
            "fix_widthsecpeak"] if self.include_sec_peak else None

        # Reflections flag
        self.include_reflections = ana_config.get("include_reflection", False)

        # Is this a trigger weighted histogram?
        self.apply_weights = ana_config["triggersel"]["weighttrig"]

        # Systematics
        self.syst_pars = ana_config.get("systematics", {})
        self.syst_init_sigma_from = None
        if self.syst_pars:
            self.syst_init_sigma_from = self.syst_pars.get(
                "init_sigma_from", "central")
            if not isinstance(self.syst_init_sigma_from, list):
                self.syst_init_sigma_from = [self.syst_init_sigma_from
                                             ] * self.n_bins1
            if not isinstance(self.syst_init_sigma_from[0], list):
                self.syst_init_sigma_from = [self.syst_init_sigma_from
                                             ] * self.n_bins2
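
A hypothetical instantiation, assuming the per-case analysis dictionary has already been loaded from the YAML database (the case key, analysis type and file names are illustrative):

# Hypothetical usage; "database" is the per-case dictionary from the analysis YAML.
import yaml

with open("database_ml_parameters_D0jet.yml") as stream:   # illustrative path
    database = yaml.safe_load(stream)["D0jet"]             # illustrative case key
fit_pars_factory = MLFitParsFactory(database, "jet_obs",   # illustrative ana_type
                                    "masshistos_data.root", "masshistos_mc.root")
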
Example No. 26
    def preparesample(self):
        logger = get_logger()
        print("prepare sample")
        self.df_data = pd.read_pickle(self.f_reco_data)
        self.df_mc = pd.read_pickle(self.f_reco_mc)
        self.df_mcgen = pd.read_pickle(self.f_gen_mc)
        self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)
        arraydf = [self.df_data, self.df_mc]
        self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin,
                                     self.p_binmax)
        self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin,
                                        self.p_binmin, self.p_binmax)
        self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin,
                                       self.p_binmax)

        self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[
            self.p_tagbkg]
        self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin,
                                      self.p_binmax)
        self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin,
                                      self.p_binmax)
        self.df_sig = self.df_sig.query(self.s_selsigml)
        self.df_bkg = self.df_bkg.query(self.s_selbkgml)
        self.df_bkg["ismcsignal"] = 0
        self.df_bkg["ismcprompt"] = 0
        self.df_bkg["ismcfd"] = 0
        self.df_bkg["ismcbkg"] = 0

        if self.p_nsig > len(self.df_sig):
            logger.warning("There are not enough signal events")
        if self.p_nbkg > len(self.df_bkg):
            logger.warning("There are not enough background events")

        self.p_nsig = min(len(self.df_sig), self.p_nsig)
        self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)

        logger.info("Used number of signal events is %d", self.p_nsig)
        logger.info("Used number of background events is %d", self.p_nbkg)

        self.df_ml = pd.DataFrame()
        self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
        self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
        self.df_sig = self.df_sig[:self.p_nsig]
        self.df_bkg = self.df_bkg[:self.p_nbkg]
        self.df_sig[self.v_sig] = 1
        self.df_bkg[self.v_sig] = 0
        self.df_ml = pd.concat([self.df_sig, self.df_bkg])
        self.df_mltrain, self.df_mltest = train_test_split(self.df_ml, \
                                           test_size=self.test_frac, random_state=self.rnd_splt)
        self.df_mltrain = self.df_mltrain.reset_index(drop=True)
        self.df_mltest = self.df_mltest.reset_index(drop=True)
        self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(
            self.df_mltrain, self.v_sig)
        self.df_sigtest, self.df_bkgtest = split_df_sigbkg(
            self.df_mltest, self.v_sig)
        logger.info("Nev ml train %d and test %d", len(self.df_mltrain),
                    len(self.df_mltest))
        logger.info("Nev signal train %d and test %d", len(self.df_sigtrain),
                    len(self.df_sigtest))
        logger.info("Nev bkg train %d and test %d", len(self.df_bkgtrain),
                    len(self.df_bkgtest))

        self.df_xtrain = self.df_mltrain[self.v_train]
        self.df_ytrain = self.df_mltrain[self.v_sig]
        self.df_xtest = self.df_mltest[self.v_train]
        self.df_ytest = self.df_mltest[self.v_sig]
Example No. 27
class Optimiser:
    #Class Attribute
    species = "optimiser"

    def __init__(self, data_param, case, model_config, grid_config, binmin,
                 binmax, raahp):

        self.logger = get_logger()

        dirmcml = data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
        dirdataml = data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
        dirdatatotsample = data_param["multi"]["data"]["pkl_evtcounter_all"]
        self.v_bin = data_param["var_binning"]
        #directory
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        #ml file names
        self.n_reco = data_param["files_names"]["namefile_reco"]
        self.n_reco = self.n_reco.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_evt = data_param["files_names"]["namefile_evt"]
        self.n_gen = data_param["files_names"]["namefile_gen"]
        self.n_gen = self.n_gen.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_treetest = data_param["files_names"]["treeoutput"]
        self.n_reco_applieddata = data_param["files_names"]["namefile_reco_applieddata"]
        self.n_reco_appliedmc = data_param["files_names"]["namefile_reco_appliedmc"]
        # ml files
        self.f_gen_mc = os.path.join(dirmcml, self.n_gen)
        self.f_reco_mc = os.path.join(dirmcml, self.n_reco)
        self.f_evt_mc = os.path.join(dirmcml, self.n_evt)
        self.f_reco_data = os.path.join(dirdataml, self.n_reco)
        self.f_evt_data = os.path.join(dirdataml, self.n_evt)
        self.f_evttotsample_data = os.path.join(dirdatatotsample, self.n_evt)
        self.f_reco_applieddata = os.path.join(self.dirmlout, self.n_reco_applieddata)
        self.f_reco_appliedmc = os.path.join(self.dirmlout, self.n_reco_appliedmc)
        #variables
        self.v_all = data_param["variables"]["var_all"]
        self.v_train = data_param["variables"]["var_training"]
        self.v_bound = data_param["variables"]["var_boundaries"]
        self.v_sig = data_param["variables"]["var_signal"]
        self.v_invmass = data_param["variables"]["var_inv_mass"]
        self.v_cuts = data_param["variables"].get("var_cuts", [])
        self.v_corrx = data_param["variables"]["var_correlation"][0]
        self.v_corry = data_param["variables"]["var_correlation"][1]
        self.v_isstd = data_param["bitmap_sel"]["var_isstd"]
        self.v_ismcsignal = data_param["bitmap_sel"]["var_ismcsignal"]
        self.v_ismcprompt = data_param["bitmap_sel"]["var_ismcprompt"]
        self.v_ismcfd = data_param["bitmap_sel"]["var_ismcfd"]
        self.v_ismcbkg = data_param["bitmap_sel"]["var_ismcbkg"]
        #parameters
        self.p_case = case
        self.p_nbkg = data_param["ml"]["nbkg"]
        self.p_nsig = data_param["ml"]["nsig"]
        self.p_tagsig = data_param["ml"]["sampletagforsignal"]
        self.p_tagbkg = data_param["ml"]["sampletagforbkg"]
        self.p_binmin = binmin
        self.p_binmax = binmax
        self.p_npca = None
        self.p_mltype = data_param["ml"]["mltype"]
        self.p_nkfolds = data_param["ml"]["nkfolds"]
        self.p_ncorescross = data_param["ml"]["ncorescrossval"]
        self.rnd_shuffle = data_param["ml"]["rnd_shuffle"]
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]
        self.p_plot_options = data_param["variables"].get("plot_options", {})
        self.p_dofullevtmerge = data_param["dofullevtmerge"]
        #dataframes
        self.df_mc = None
        self.df_mcgen = None
        self.df_data = None
        self.df_sig = None
        self.df_bkg = None
        self.df_ml = None
        self.df_mltest = None
        self.df_mltrain = None
        self.df_sigtrain = None
        self.df_sigtest = None
        self.df_bkgtrain = None
        self.df_bkgtest = None
        self.df_xtrain = None
        self.df_ytrain = None
        self.df_xtest = None
        self.df_ytest = None
        #selections
        self.s_selbkgml = data_param["ml"]["sel_bkgml"]
        self.s_selsigml = data_param["ml"]["sel_sigml"]
        #model param
        self.db_model = model_config
        self.p_class = None
        self.p_classname = None
        self.p_trainedmod = None
        self.s_suffix = None
        #config files
        self.c_gridconfig = grid_config

        #significance
        self.f_fonll = data_param["ml"]["opt"]["filename_fonll"]
        self.p_fonllband = data_param["ml"]["opt"]["fonll_pred"]
        self.p_fragf = data_param["ml"]["opt"]["FF"]
        self.p_sigmamb = data_param["ml"]["opt"]["sigma_MB"]
        self.p_taa = data_param["ml"]["opt"]["Taa"]
        self.p_br = data_param["ml"]["opt"]["BR"]
        self.p_fprompt = data_param["ml"]["opt"]["f_prompt"]
        self.p_bkgfracopt = data_param["ml"]["opt"]["bkg_data_fraction"]
        self.p_nstepsign = data_param["ml"]["opt"]["num_steps"]
        self.p_savefit = data_param["ml"]["opt"]["save_fit"]
        self.p_nevtml = None
        self.p_nevttot = None
        self.p_presel_gen_eff = data_param["analysis"]["presel_gen_eff"]
        self.p_mass_fit_lim = data_param["analysis"]['mass_fit_lim']
        self.p_bin_width = data_param["analysis"]['bin_width']
        self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) / \
                                     self.p_bin_width))
        self.p_mass = data_param["mass"]
        self.p_raahp = raahp
        self.preparesample()
        self.loadmodels()
        self.create_suffix()
        self.df_evt_data = None
        self.df_evttotsample_data = None

        self.f_reco_applieddata = \
                self.f_reco_applieddata.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_reco_appliedmc = \
                self.f_reco_appliedmc.replace(".pkl", "%s.pkl" % self.s_suffix)
Example No. 28
def calc_bkg(df_bkg, name, num_steps, fit_region, bkg_func, bin_width,
             sig_region, save_fit, out_dir, pt_lims, multiclass_labels):
    """
    Estimate the number of background candidates under the signal peak. This is obtained
    from real data with a fit of the sidebands of the invariant mass distribution.
    """
    logger = get_logger()
    if multiclass_labels is None:
        ns_left = int(num_steps / 10) - 1
        ns_right = num_steps - ns_left
        x_axis_left = np.linspace(0., 0.49, ns_left)
        x_axis_right = np.linspace(0.5, 1.0, ns_right)
        x_axis = np.concatenate((x_axis_left, x_axis_right))
    else:
        x_axis = np.linspace(0, 0.4, num_steps)
    bkg_array = []
    bkg_err_array = []
    num_bins = (fit_region[1] - fit_region[0]) / bin_width
    num_bins = int(round(num_bins))
    bin_width = (fit_region[1] - fit_region[0]) / num_bins

    if save_fit:
        logger.debug("Saving bkg fits to file")
        pt_min = pt_lims[0]
        pt_max = pt_lims[1]
        out_file = TFile(
            f'{out_dir}/bkg_fits_{name}_pt{pt_min:.1f}_{pt_max:.1f}.root',
            'recreate')
        out_file.cd()

    logger.debug("To fit the bkg a %s function is used", bkg_func)
    if multiclass_labels is not None:
        for thr0 in x_axis:
            for thr1 in x_axis:
                bkg = 0.
                bkg_err = 0.
                hmass = TH1F(f'hmass_{thr0:.5f}_{thr1:.5f}', '', num_bins,
                             fit_region[0], fit_region[1])
                mlsel_multi0 = 'y_test_prob' + name + multiclass_labels[
                    0] + ' <= ' + str(thr0)
                mlsel_multi1 = 'y_test_prob' + name + multiclass_labels[
                    1] + ' >= ' + str(thr1)
                mlsel_multi = mlsel_multi0 + ' and ' + mlsel_multi1
                sel_mass_array = df_bkg.query(mlsel_multi)['inv_mass'].values

                if len(sel_mass_array) > 5:
                    for mass_value in np.nditer(sel_mass_array):
                        hmass.Fill(mass_value)
                    fit = hmass.Fit(bkg_func, 'Q', '', fit_region[0],
                                    fit_region[1])
                    if save_fit:
                        hmass.Write()
                    if int(fit) == 0:
                        fit_func = hmass.GetFunction(bkg_func)
                        bkg = fit_func.Integral(sig_region[0],
                                                sig_region[1]) / bin_width
                        bkg_err = fit_func.IntegralError(
                            sig_region[0], sig_region[1]) / bin_width
                        del fit_func
                elif save_fit:
                    hmass.Write()

                bkg_array.append(bkg)
                bkg_err_array.append(bkg_err)
                del hmass
    else:
        for thr in x_axis:
            bkg = 0.
            bkg_err = 0.
            hmass = TH1F(f'hmass_{thr:.5f}', '', num_bins, fit_region[0],
                         fit_region[1])
            bkg_sel_mask = df_bkg['y_test_prob' + name].values >= thr
            sel_mass_array = df_bkg[bkg_sel_mask]['inv_mass'].values

            if len(sel_mass_array) > 5:
                for mass_value in np.nditer(sel_mass_array):
                    hmass.Fill(mass_value)
                fit = hmass.Fit(bkg_func, 'Q', '', fit_region[0],
                                fit_region[1])
                if save_fit:
                    hmass.Write()
                if int(fit) == 0:
                    fit_func = hmass.GetFunction(bkg_func)
                    bkg = fit_func.Integral(sig_region[0],
                                            sig_region[1]) / bin_width
                    bkg_err = fit_func.IntegralError(sig_region[0],
                                                     sig_region[1]) / bin_width
                    del fit_func
            elif save_fit:
                hmass.Write()

            bkg_array.append(bkg)
            bkg_err_array.append(bkg_err)
            del hmass

    if save_fit:
        out_file.Close()

    return bkg_array, bkg_err_array, x_axis
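
A hypothetical call for the binary case (multiclass_labels=None), assuming df_bkg holds sideband candidates with 'inv_mass' and 'y_test_probmymodel' columns; the mass windows are illustrative:

# Hypothetical usage; windows in GeV/c^2 are illustrative, "expo" is a ROOT built-in.
bkgs, bkg_errs, thresholds = calc_bkg(
    df_bkg, "mymodel", num_steps=101,
    fit_region=(1.72, 2.05), bkg_func="expo", bin_width=0.01,
    sig_region=(1.85, 1.89), save_fit=False, out_dir=".",
    pt_lims=(2.0, 4.0), multiclass_labels=None)
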
Example No. 29
def filter_df_cand(dataframe, main_dict, sel_opt):
    '''Filter a dataframe looking at the type of candidate.

    It works both for bitmap and old selection method.
    In 'database_ml_parameters.yml' only one between old_sel and bitmap_sel must have 'use: True'

    Implemented selection options:
        - 'mc_signal' -> select MC signal
        - 'mc_signal_prompt' -> select only prompt MC signal
        - 'mc_signal_FD' -> select only feed-down MC signal
        - 'mc_bkg' -> select MC background
        - 'presel_track_pid' -> select candidates satisfying PID and track pre-selections
        - 'sel_std_analysis' -> select candidates fulfilling the std analysis selections

    Args:
        dataframe: pandas dataframe to filter
        main_dict: dictionary of parameters loaded from 'database_ml_parameters.yml'
        sel_opt: selection option (string)

    Return:
        df_selected: filtered pandas dataframe
    '''
    logger = get_logger()

    bitmap_dict = main_dict['bitmap_sel']
    old_dict = main_dict['old_sel']
    use_bitmap = bitmap_dict['use']
    use_old = old_dict['use']

    if use_bitmap == use_old:
        logger.critical(
            "Exactly one of the selection methods has to be used, i.e. with "
            "the 'use' flag set to True")

    if use_bitmap:
        logger.debug("Using bitmap selection")

        if sel_opt == 'mc_signal':
            sel_bits = bitmap_dict['mcsignal_on_off']
        elif sel_opt == 'mc_signal_prompt':
            sel_bits = bitmap_dict['mcsignal_prompt_on_off']
        elif sel_opt == 'mc_signal_FD':
            sel_bits = bitmap_dict['mcsignal_feed_on_off']
        elif sel_opt == 'mc_bkg':
            sel_bits = bitmap_dict['mcbkg_on_off']
        elif sel_opt == 'presel_track_pid':
            sel_bits = bitmap_dict['preseltrack_pid_on_off']
        elif sel_opt == 'sel_std_analysis':
            sel_bits = bitmap_dict['std_analysis_on_off']
        else:
            logger.critical("Wrong selection option!")

        logger.debug("Candidates before selection: %d", len(dataframe))
        df_selected = filter_bit_df(dataframe, bitmap_dict['var_sel'],
                                    sel_bits)
        logger.debug("Candidates after %s selection: %d", sel_opt,
                     len(df_selected))

    if use_old:
        logger.debug("Using old selection")

        if sel_opt == 'mc_signal':
            sel_string = old_dict['mc_signal']
        elif sel_opt == 'mc_signal_prompt':
            sel_string = old_dict['mc_signal_prompt']
        elif sel_opt == 'mc_signal_FD':
            sel_string = old_dict['mc_signal_FD']
        elif sel_opt == 'mc_bkg':
            sel_string = old_dict['mc_bkg']
        elif sel_opt == 'presel_track_pid':
            sel_string = old_dict['presel_track_pid']
        elif sel_opt == 'sel_std_analysis':
            sel_string = old_dict['sel_std_analysis']
        else:
            logger.critical("Wrong selection option!")

        logger.debug("Candidates before selection: %d", len(dataframe))
        df_selected = dataframe.query(sel_string)
        logger.debug("Candidates after %s selection: %d", sel_opt,
                     len(df_selected))

    return df_selected
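
A hypothetical call selecting prompt MC signal, assuming main_dict has been loaded from 'database_ml_parameters.yml':

# Hypothetical usage; df_mc and main_dict come from the surrounding analysis code.
df_prompt = filter_df_cand(df_mc, main_dict, 'mc_signal_prompt')
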
Example No. 30
# HF specific imports
from machine_learning_hep.logger import get_logger
# pylint: disable=import-error, no-name-in-module
from ROOT import gStyle


# pylint: disable=too-few-public-methods
class WorkflowBase:
    """
    Base class for all workflows related classes including systematics
    """
    species = "workflow_base"

    def __init__(self, datap, case, typean, period=None):

        self.logger = get_logger()
        self.datap = datap
        self.case = case
        self.typean = typean
        self.period = period

    @staticmethod
    def loadstyle():
        gStyle.SetOptStat(0)
        gStyle.SetPalette(1)
        gStyle.SetNumberContours(100)
        gStyle.SetCanvasColor(0)
        gStyle.SetFrameFillColor(0)

    @staticmethod