Esempio n. 1
0
def plot_hist_fit_auto(df, branch, cut_BDT=None, **kwargs):
    """ Wrapper around :py:func:`plot_hist_fit` that fills in some arguments automatically.

    * The latex name and the unit of the branch are retrieved
      with ``pt.get_latex_branches_units``.
    * The figure name and the title are completed with the BDT cut.
    * If no folder name is given, the name of the data is used as folder name.

    Parameters
    ----------

    df            : pandas.Dataframe
        dataframe that contains the branch to plot
    branch : str
        branch (for instance: ``'B0_M'``), in dataframe
    cut_BDT         : float or str
        ``BDT > cut_BDT`` cut. Used in the name of the saved figure and in the title.
    **kwargs : dict
        arguments passed to :py:func:`plot_hist_fit` (except ``branch``, ``latex_branch``, ``unit``).
        The keys ``'fig_name'``, ``'title'``, ``'data_name'`` and ``'folder_name'``
        are read and possibly overwritten here before being passed on.

    Returns
    -------
    The return value of :py:func:`plot_hist_fit`, unchanged.
    According to the original documentation of this wrapper:

    fig   : matplotlib.figure.Figure
        Figure of the plot (only if ``axis_mode`` is ``False``)
    ax[0] : matplotlib.figure.Axes
        Axis of the histogram + fitted curves + table
    ax[1] : matplotlib.figure.Axes
        Axis of the pull diagram (only if ``plot_pull`` is ``True``)
    """

    latex_branch, unit = pt.get_latex_branches_units(branch)

    # The keys below are read right after, so they are registered first
    # (presumably ``add_in_dic`` inserts a default value when the key is missing -- TODO confirm).
    for key in ('fig_name', 'title', 'data_name'):
        add_in_dic(key, kwargs)

    # Name of the saved figure, completed with the BDT cut
    fit_data_name = string.add_text(kwargs['data_name'], 'fit', '_', None)
    kwargs['fig_name'] = pt._get_fig_name_given_BDT_cut(
        fig_name=kwargs['fig_name'],
        cut_BDT=cut_BDT,
        branch=branch,
        data_name=fit_data_name)

    # Title, completed with the BDT cut
    kwargs['title'] = pt._get_title_given_BDT_cut(title=kwargs['title'],
                                                  cut_BDT=cut_BDT)

    # By default, the folder is named after the data
    add_in_dic('folder_name', kwargs)
    if kwargs['folder_name'] is None:
        data_name = kwargs['data_name']
        if data_name is not None:
            kwargs['folder_name'] = data_name

    return plot_hist_fit(df,
                         branch,
                         latex_branch=latex_branch,
                         unit=unit,
                         **kwargs)
Esempio n. 2
0
def _get_title_given_BDT_cut(title, cut_BDT):
    """ Return the the new title given the cut on the BDT

    Parameters
    ----------
    title       : str
        initial title
    cut_BDT     : float
        cut on the BDT (we keep ``BDT > {cut_BDT}``)

    Returns
    -------
    title: str
        new title

    Examples
    --------
    >>> _get_title_given_BDT_cut("title", -0.1)
    "title - BDT > -0.1"
    """

    # Title with BDT
    if cut_BDT is not None:
        title = string.add_text(title, f"BDT $>$ {cut_BDT}", ' - ')

    return title
Esempio n. 3
0
    def get_raw_branch(particle, raw_variable):
        """ Build the name(s) of the raw branch(es) from the particle(s) and the raw variable(s).

        Parameters
        ----------
        particle: str or tuple(str) or None
            particle or list of particles. When ``raw_variable`` is a list or tuple,
            it is passed through ``el_to_list`` to get one particle per raw variable.
        raw_variable: str or list(str) or tuple(str)
            name of the raw variable, or names of the raw variables

        Returns
        -------
        raw_branch: str or tuple(str)
            Name of the raw branch or tuple of the names of the raw branches

            * if ``raw_variable`` is not a list or tuple: ``add_text(particle, raw_variable)``
              (presumably ``{raw_variable}`` when there is no particle
              and ``{particle}_{raw_variable}`` otherwise -- the separator is the default one of ``add_text``)
            * if ``raw_variable`` is a list or tuple: tuple of the raw branches,
              computed for each (particle, raw variable) pair
        """

        # Single raw variable: the branch is directly built from the two texts
        if not assertion.is_list_tuple(raw_variable):
            return add_text(particle, raw_variable)

        # Several raw variables: one particle per raw variable, then one branch per pair
        particles = tuple(el_to_list(particle, len(raw_variable)))
        raw_branches = []
        for sub_particle, sub_raw_variable in zip(particles, raw_variable):
            raw_branches.append(
                RVariable.get_raw_branch(sub_particle, sub_raw_variable))

        return tuple(raw_branches)
Esempio n. 4
0
def apply_BDT(df_tot,
              df_train,
              bdt,
              BDT_name=None,
              save_BDT=False,
              kind_data='common'):
    """ Apply a trained BDT to a dataset and save the result.

    * Apply the BDT to the dataframe ``df_train``, which contains only the training variables.
    * Add the BDT output as a new column ``'BDT'`` of ``df_tot`` (``df_tot`` is modified in place).
    * Save the BDT output alone with ``save_root``, under the name ``BDT_{file_name}``
    * Save ``df_tot`` with ``save_root``, under the name ``{file_name}``
    * if ``save_BDT`` is ``True``, dump the BDT with ``dump_pickle``, under the name
      given by ``string.add_text('bdt', file_name, '_')``

    where ``file_name = string.add_text(kind_data, BDT_name, '_')``.
    Both calls to ``save_root`` are given ``'DecayTree'`` as third argument
    (presumably the name of the tree in the root file -- TODO confirm).

    Parameters
    ----------
    df_tot        : pandas.Dataframe
        dataframe that will be saved together with the BDT output
    df_train      : pandas.Dataframe
        dataframe with only the variables that have been used for the training
    bdt           : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained BDT classifier (its ``decision_function`` method is used)
    BDT_name      : str
        name of the BDT, used for the name of the saved files
    save_BDT      : bool
        if ``True``, save the BDT in a pickle file
    kind_data     : str
        name of the data where the BDT is applied to (e.g., ``'MC'``, ``'common'``, ...)
    """

    # The columns of df_train must be the training variables, in the right order
    bdt_scores = bdt.decision_function(df_train)
    df_tot['BDT'] = bdt_scores

    file_name = string.add_text(kind_data, BDT_name, '_')

    # Dataframe that contains only the BDT output
    df_bdt_only = pd.DataFrame()
    df_bdt_only['BDT'] = df_tot['BDT']

    save_root(df_bdt_only, 'BDT_' + file_name, 'DecayTree')
    save_root(df_tot, file_name, 'DecayTree')

    if not save_BDT:
        return

    pickle_name = string.add_text('bdt', file_name, '_')
    dump_pickle(bdt, pickle_name)
Esempio n. 5
0
def _get_fig_name_given_BDT_cut(fig_name=None,
                                cut_BDT=None,
                                branch="",
                                data_name=None):
    """ Return the new name of the file and the new title given the cut on the BDT

    Parameters
    ----------
    fig_name   : str
        initial name of the file
    cut_BDT     : float
        cut on the BDT (we keep ``BDT > {cut_BDT}``)
    branch    : float
        a name of branch (e.g., ``'B0_M'``)
    data_name   : str or None
        name of the plotted data

    Returns
    -------
    fig_name: str
        new fig_name

    Examples
    --------
    >>> _get_title_given_BDT_cut("fig_name", -0.1, 'B0_M', 'MC')
    "fig_BDT_name-0.1"
    >>> _get_title_given_BDT_cut(None, -0.1, 'B0_M', 'MC')
    "B0_M_MC_BDT-0.1"
    """

    assert (fig_name is not None) or (data_name is not None)

    if fig_name is None:
        fig_name = string.add_text(branch, data_name, '_')

    # Title with BDT
    if cut_BDT is not None:
        fig_name = string.add_text(fig_name, f'BDT{cut_BDT}')

    return fig_name
Esempio n. 6
0
    def quantity(self):
        """ Combined physical quantity, with the function applied to it.
        Only defined if there is zero or one particle:
        ``None`` is returned if ``self.particle`` is a list or tuple of more than one element.

        The raw quantities are joined with ``','``
        and ``self.name_function`` is attached with the separator ``':'`` by ``add_text``,
        which presumably gives:

        * ``{raw_quantity}`` if there is one raw quantity and no function
        * ``{raw_quantity}:{name_function}`` if there is one raw quantity and a function
        * ``{raw_quantity[0]},{raw_quantity[1]}:{name_function}`` if there are two raw quantities and a function
        """
        several_particles = (assertion.is_list_tuple(self.particle)
                             and len(self.particle) > 1)
        if several_particles:
            return None

        # A single raw quantity is given as a string; wrap it so that join() works
        raw_quantities = self.raw_quantity
        if isinstance(raw_quantities, str):
            raw_quantities = [raw_quantities]

        joined_quantities = ','.join(raw_quantities)
        return add_text(joined_quantities, self.name_function, sep=':')
Esempio n. 7
0
def classification_report_print(X_test, y_test, bdt, BDT_name=None):
    """ Test the bdt training with the testing sample.
    Print the classification report and the area under the ROC curve, and save them in a text file.

    The text file is ``{path}/{fig_name}.txt``, where

    * ``path`` is returned by ``create_directory(f"{loc['tables']}/BDT/", BDT_name)``
    * ``fig_name = string.add_text('classification_report', BDT_name, '_')``

    Parameters
    ----------
    X_test    : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for test
    y_test    : numpy.array
        Array with 1 for the signal events, and 0 for background events (shuffled) for test
    bdt       : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained classifier
    BDT_name      : str
        name of the BDT, used for the path of the saved txt file.
    """
    # Classification report, from the predicted classes
    predicted_classes = bdt.predict(X_test)
    report = classification_report(
        y_test, predicted_classes, target_names=["background", "signal"])
    print(report)

    # Area under the ROC curve, from the scores of the classifier:
    # bdt.decision_function(X_test) returns a numpy array, in which each element
    # represents whether a predicted sample lies to the right or left side
    # of the hyperplane, and also how far from the hyperplane.
    scores = bdt.decision_function(X_test)
    ROC_AUC_score = roc_auc_score(y_test, scores)
    roc_auc_text = "Area under ROC curve: %.4f" % (ROC_AUC_score)
    print(roc_auc_text)

    # Write the results -----
    file_name = string.add_text('classification_report', BDT_name, '_')
    directory = create_directory(f"{loc['tables']}/BDT/", BDT_name)

    with open(f"{directory}/{file_name}.txt", 'w') as report_file:
        report_file.write(report)
        report_file.write(roc_auc_text)
Esempio n. 8
0
def plot_scatter2d(dfs,
                   branches,
                   latex_branches,
                   units=None,
                   low=None,
                   high=None,
                   n_bins=100,
                   colors=None,
                   data_name=None,
                   title=None,
                   fig_name=None,
                   folder_name=None,
                   fontsize_label=default_fontsize['label'],
                   save_fig=True,
                   ax=None,
                   get_sc=False,
                   pos_text_LHC=None,
                   **params):
    """  Plot a 2D scatter plot of 2 branches, for one or several datasets.

    Parameters
    ----------
    dfs               : dict(str: pandas.Dataframe)
        Associates the name of a dataset (used as label of the scatter plot)
        with the corresponding dataframe.
    branches          : [str, str]
        names of the two branches
    latex_branches    : [str, str]
        latex names of the two branches
    units             : str or [str, str]
        Common unit or list of two units of the two branches.
        ``None`` (default) is equivalent to ``[None, None]``.
    low               : float or [float, float]
        low  value(s) of the branches
    high              : float or [float, float]
        high value(s) of the branches
    n_bins            : int or [int, int]
        accepted for compatibility, but not used by this function
    colors            : list(str)
        colors used for the scatter plots: ``colors[k]`` is the color of the
        ``k``-th dataset of ``dfs``.
        ``None`` (default) is equivalent to ``['g', 'r', 'orange', 'b']``.
    data_name         : str or list(str)
        name of the data, used to define the name of the figure
        in the case ``fig_name`` is not defined.
        If ``None``, the names of the datasets (keys of ``dfs``) are used instead.
    title             : str
        title of the figure
    fig_name          : str
        name of the saved figure
    folder_name       : str
        name of the folder where to save the figure
    fontsize_label    : float
        fontsize of the label of the axes
    save_fig          : bool
        specifies if the figure is saved. Ignored (no saving) if ``ax`` is specified.
    ax            : matplotlib.axes.Axes
        axis where to plot
    get_sc            : bool
        if True: get the scatter plot
    pos_text_LHC    : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb` as the ``pos`` argument.
    **params          : dict
        passed to ``ax.scatter``

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot (only if the figure returned by ``get_fig_ax`` is not ``None``,
        presumably when ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (same condition as ``fig``)
    scs : matplotlib.PathCollection or list(matplotlib.PathCollection)
        scatter plot or list of scatter plots (only if ``get_sc`` is ``True``)

    If none of the conditions above is met, ``None`` is returned.
    """

    # Defaults are created here rather than in the signature,
    # so that no list is shared between calls.
    if units is None:
        units = [None, None]
    if colors is None:
        # 'orange' replaces the former 'o', which is a marker code
        # and not a valid matplotlib color.
        colors = ['g', 'r', 'orange', 'b']

    # low, high and units into a list of size 2
    low = el_to_list(low, 2)
    high = el_to_list(high, 2)
    units = el_to_list(units, 2)

    # A figure that was not created here is not saved here
    if ax is not None:
        save_fig = False

    fig, ax = get_fig_ax(ax)

    title = string.add_text(None, title, default=None)
    ax.set_title(title, fontsize=25)

    # One scatter plot per dataset. The loop variable must not be called
    # ``data_name``: it would overwrite the parameter of the same name.
    scs = [
        ax.scatter(df[branches[0]],
                   df[branches[1]],
                   c=colors[k],
                   label=name,
                   **params)
        for k, (name, df) in enumerate(dfs.items())
    ]
    if len(scs) == 1:
        scs = scs[0]

    ax.set_xlim([low[0], high[0]])
    ax.set_ylim([low[1], high[1]])

    # Labels
    pt.set_label_ticks(ax)
    pt.set_text_LHCb(ax, pos=pos_text_LHC)

    set_label_2Dhist(ax, latex_branches, units, fontsize=fontsize_label)

    # Save the figure
    if save_fig:
        # Without explicit data name, fall back on the names of the datasets
        if data_name is None and len(dfs) > 0:
            names = list(dfs.keys())
            data_name = names[0] if len(names) == 1 else names

        default_fig_name = string.add_text(
            string.list_into_string(branches, '_vs_'),
            string.list_into_string(data_name, '_'), '_')
        pt.save_fig(fig, fig_name, folder_name, default_fig_name)

    if fig is not None:
        if get_sc:
            return fig, ax, scs
        return fig, ax

    if get_sc:
        return scs

    return None
Esempio n. 9
0
def plot_hist2d(df,
                branches,
                latex_branches,
                units,
                low=None,
                high=None,
                n_bins=100,
                log_scale=False,
                title=None,
                fig_name=None,
                folder_name=None,
                data_name=None,
                save_fig=True,
                ax=None,
                pos_text_LHC=None):
    """  Plot a 2D histogram of 2 branches.

    Parameters
    ----------
    df                : pandas.Dataframe
        Dataframe that contains the 2 branches to plot
    branches          : [str, str]
        names of the two branches
    latex_branches    : [str, str]
        latex names of the two branches
    units             : str or [str, str]
        Common unit or list of two units of the two branches
    low               : float or [float, float]
        low  value(s) of the branches
    high              : float or [float, float]
        high value(s) of the branches
    n_bins            : int or [int, int]
        number of bins
    log_scale         : bool
        if true, the colorbar is in logscale
    title             : str
        title of the figure
    fig_name       : str
        name of the saved figure
    folder_name     : str
        name of the folder where to save the figure
    data_name         : str
        name of the data, this is used in the title and to define
        the name of the figure, in the case ``fig_name`` is not defined.
    save_fig        : bool
        specifies if the figure is saved
    ax            : matplotlib.axes.Axes
        axis where to plot
    pos_text_LHC    : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb` as the ``pos`` argument.

    Returns
    -------
    The return value of ``end_plot_function``. According to the original documentation:

    fig : matplotlib.figure.Figure
        Figure of the plot (only if ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (only if ``ax`` is not specified)
    """

    # low, high and units into a list of size 2
    low = el_to_list(low, 2)
    high = el_to_list(high, 2)

    units = el_to_list(units, 2)

    for i in range(2):
        low[i], high[i] = pt._redefine_low_high(low[i], high[i],
                                                df[branches[i]])

    # Plotting
    fig, ax = get_fig_ax(ax)

    title = string.add_text(data_name, title, default=None)

    ax.set_title(title, fontsize=25)

    # ``norm`` is only passed in log scale, so that the linear case
    # keeps the default normalisation of hist2d.
    hist_options = {
        'range': [[low[0], high[0]], [low[1], high[1]]],
        'bins': n_bins,
    }
    if log_scale:
        hist_options['norm'] = LogNorm()

    _, _, _, h = ax.hist2d(df[branches[0]], df[branches[1]], **hist_options)

    # Label, color bar
    pt.set_label_ticks(ax)
    pt.set_text_LHCb(ax, pos=pos_text_LHC)

    set_label_2Dhist(ax, latex_branches, units, fontsize=25)

    # The colorbar is attached to the figure and the axis of the histogram,
    # and not to the current figure/axis of pyplot, which can be a different
    # one when ``ax`` is provided by the caller.
    cbar = ax.figure.colorbar(h, ax=ax)
    cbar.ax.tick_params(labelsize=20)

    return end_plot_function(fig,
                             save_fig=save_fig,
                             fig_name=fig_name,
                             folder_name=folder_name,
                             default_fig_name=string.add_text(
                                 string.list_into_string(branches, '_vs_'),
                                 data_name, '_'),
                             ax=ax)
Esempio n. 10
0
def _plot_single_model(ax,
                       x,
                       model,
                       plot_scaling,
                       model_type=None,
                       model_name=None,
                       frac=1.,
                       color='b',
                       linestyle='-',
                       line_width=2.5,
                       alpha=1):
    """ Plot a single model with :py:func:`plot_fitted_curve`,
    with a label for the curve ``"{name of the PDF (e.g., Gaussian, ...)} - {type of the model, e.g., signal ...} {Name of the model, e.g., "B0->Dst Ds"}"`` (if ``model_name`` is specified)

    Parameters
    ----------
    ax           : matplotlib.axes.Axes
        axis where to plot
    x             : numpy.array(float)
        points of the x-axis where to evaluate the pdf of the model to plot
    model        : zfit.pdf.BasePDF
        just one zfit model (a list or tuple of models is not accepted)
    plot_scaling : float
        scaling to get the scale of the curve right
    model_type  : str
        type of the model, key of ``model_names_types``, e.g.,

        * ``'m'`` : model (sum)
        * ``'s'`` : signal
        * ``'b'`` : background

        used in the legend to indicate if it is a signal or a background component.
        Only read if ``model_name`` is specified.
    model_name : str
        name of the model - used in the legend.
        If ``None``, the curve has no label.
    frac        : float
        frac is multiplied to the PDF to get the correct scale due to composite PDFs
    color      : str
        color of the curve
    linestyle  : str
        line style of the curve
    line_width  : float
        width of the plotted line
    alpha       : float
        passed to :py:func:`plot_fitted_curve` as ``alpha``
        (presumably the opacity of the curve)
    """
    assert not assertion.is_list_tuple(model)

    # Label of the curve, only if the model has a name
    label_model = None
    if model_name is not None:
        pdf_and_type = f'{get_model_name(model)} - {model_names_types[model_type]}'
        label_model = string.add_text(pdf_and_type, model_name)

    plot_fitted_curve(ax,
                      model,
                      plot_scaling,
                      x=x,
                      frac=frac,
                      color=color,
                      linestyle=linestyle,
                      line_width=line_width,
                      alpha=alpha,
                      label=label_model)
Esempio n. 11
0
def correlations(data, fig_name=None, folder_name=None, title=None, **kwds):
    """ Calculate pairwise correlation between features of the dataframe data,
    plot the correlation matrix and save the figure with ``pt.save_fig``,
    under the name ``corr_matrix_{fig_name}``, in the folder ``BDT/{folder_name}``
    (or ``BDT`` if ``folder_name`` is ``None``).

    Parameters
    ----------
    data         : pandas.Dataframe
        dataset
    fig_name     : str
        name of the saved file. If ``None``, it is built from the names
        of the columns of the correlation matrix.
    folder_name  : str
        name of the folder where to save the plot
    title        : str
        text added after ``"Correlations"`` in the title of the plot
    **kwds       : dict
        keyword arguments passed to ``pandas.DataFrame.corr()``

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    ax : matplotlib.figure.Axes
        Axis of the plot
    """

    # simply call df.corr() to get a table of
    # correlation values if you do not need
    # the fancy plotting
    corrmat = data.corr(**kwds)  # correlation
    branch_names = list(corrmat.columns.values)

    fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))  # 1 plot

    opts = {
        'cmap': plt.get_cmap("RdBu"),  # red blue color mode
        'vmin': -1,
        'vmax': +1
    }  # correlation between -1 and 1
    heatmap1 = ax1.pcolor(corrmat, **opts)  # create a pseudo color plot
    plt.colorbar(heatmap1, ax=ax1)  # color bar

    title = string.add_text("Correlations", title, ' - ')
    ax1.set_title(title)

    # The ticks are labelled with the latex names of the branches
    labels = []
    for branch_name in branch_names:
        latex_branch, _ = RVariable.get_latex_branch_unit_from_branch(
            branch_name)
        labels.append(latex_branch)

    # shift location of ticks to center of the bins
    tick_positions = np.arange(len(labels)) + 0.5
    ax1.set_xticks(tick_positions, minor=False)
    ax1.set_yticks(tick_positions, minor=False)
    ax1.set_xticklabels(labels, minor=False, ha='right', rotation=70)
    ax1.set_yticklabels(labels, minor=False)

    plt.tight_layout()

    if fig_name is None:
        # Default name: built from the names of the plotted branches
        # (the original code referred to an undefined variable here).
        fig_name = string.list_into_string(
            [str(branch_name) for branch_name in branch_names])

    # Avoid a folder literally named 'None' when no folder name is given
    save_folder = 'BDT' if folder_name is None else f'BDT/{folder_name}'

    pt.save_fig(fig,
                f"corr_matrix_{fig_name}",
                folder_name=save_folder)

    return fig, ax1