Exemple #1
0
def plot_roc(X_test, y_test, bdt, BDT_name=None):
    """ Plot the ROC curve of a trained BDT on a test sample and save it.

    The figure is handed to ``pt.save_fig`` with the name ``"ROC"`` and the
    folder ``BDT/{BDT_name}``.

    Parameters
    ----------
    X_test        : numpy.ndarray
        signal and background concatenated, testing sample
    y_test        : numpy.array
        signal and background concatenated, testing sample,
        0 if the events is background, 1 if it is signal
    bdt           : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained BDT; only its ``decision_function`` method is used here
    BDT_name      : str
        name of the BDT, used for the name of the folder where the plot
        is saved

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    ax : matplotlib.figure.Axes
        Axis of the plot
    """

    # Get the results -----
    # Score given by the BDT to every event of the test sample
    decisions = bdt.decision_function(X_test)
    # fpr / tpr: false and true positive rates obtained when cutting the
    # score at each value of ``thresholds`` (the thresholds themselves are
    # not needed for the plot)
    fpr, tpr, _ = roc_curve(y_test, decisions)
    roc_auc = auc(fpr, tpr)

    # Plot the results -----
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, lw=1, label=f'ROC (area = {roc_auc:.2f})')
    # Diagonal = what a random classifier would give
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=25)
    ax.set_ylabel('True Positive Rate', fontsize=25)

    # The legend is drawn here, hence ``show_leg=False`` in ``fix_plot``
    ax.legend(loc="lower right", fontsize=20.)
    pt.show_grid(ax)
    pt.fix_plot(ax,
                factor_ymax=1.1,
                show_leg=False,
                fontsize_ticks=20.,
                ymin_to_0=False)

    # Save the results -----
    pt.save_fig(fig, "ROC", folder_name=f'BDT/{BDT_name}')

    return fig, ax
Exemple #2
0
def plot_divide(dfs,
                branch,
                latex_branch,
                unit,
                low=None,
                high=None,
                n_bins=100,
                fig_name=None,
                folder_name=None,
                save_fig=True,
                ax=None,
                pos_text_LHC=None):
    """ Plot the bin-by-bin ratio (histogram of ``branch`` in the first
    dataframe) / (histogram of ``branch`` in the second dataframe), each
    histogram being normalised to its total number of counts.

    Only the first two entries of ``dfs`` are used.

    Parameters
    ----------
    dfs             : dict(str:pandas.Dataframe)
        Dictionnary {name of the dataframe : pandas dataframe}
    branch          : str
        name of the branch in the dataframe
    latex_branch    : str
        Latex name of the branch (for the labels of the plot)
    unit            : str
        Unit of the physical quantity
    low             : float
        low value of the distribution
    high            : float
        high value of the distribution
    n_bins          : int
        Desired number of bins of the histogram
    fig_name       : str
        name of the saved figure
    folder_name     : str
        name of the folder where to save the figure
    save_fig        : bool
        specifies if the figure is saved
    ax            : matplotlib.axes.Axes
        axis where to plot
    pos_text_LHC    : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb` as the ``pos`` argument.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot (only if ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (only if ``ax`` is not specified)

    Notes
    -----
    The return value is whatever ``end_plot_function`` returns.
    """

    fig, ax = get_fig_ax(ax)
    data_names = list(dfs.keys())
    list_dfs = list(dfs.values())

    # Common range and bin width of the two histograms
    low, high = pt._redefine_low_high(low, high,
                                      [df[branch] for df in list_dfs])
    bin_width = get_bin_width(low, high, n_bins)

    # Make the histograms, and get the bin centres and the (Poisson) error
    # on the counts in each bin
    counts1, bin_edges = np.histogram(list_dfs[0][branch],
                                      n_bins,
                                      range=(low, high))
    counts2, _ = np.histogram(list_dfs[1][branch], n_bins, range=(low, high))
    bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2.

    err1 = np.sqrt(counts1)
    err2 = np.sqrt(counts2)

    # Ratio of the normalised histograms and its propagated uncertainty.
    # Empty bins give 0/0 or x/0: both the ratio AND its error are computed
    # inside the errstate block so that these bins silently become nan/inf
    # instead of emitting RuntimeWarnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        division = counts1 * counts2.sum() / (counts2 * counts1.sum())
        err = division * np.sqrt((err1 / counts1)**2 + (err2 / counts2)**2)

    ax.errorbar(bin_centres, division, yerr=err, fmt='o', color='k')
    # Reference line at ratio = 1
    ax.plot([low, high], [1., 1.], linestyle='--', color='b', marker='')

    # Labels
    set_label_divided_hist(ax,
                           latex_branch,
                           unit,
                           bin_width,
                           data_names,
                           fontsize=25)

    # Set lower and upper range of the x and y axes
    # (keyword spelled ``ymin_to_0`` as in the other calls to ``fix_plot``)
    pt.fix_plot(ax,
                factor_ymax=1.1,
                show_leg=False,
                fontsize_ticks=20.,
                ymin_to_0=False,
                pos_text_LHC=pos_text_LHC)

    # Save
    return end_plot_function(
        fig,
        save_fig=save_fig,
        fig_name=fig_name,
        folder_name=folder_name,
        default_fig_name=
        f"{branch.replace('/','d')}_{string.list_into_string(data_names,'_d_')}",
        ax=ax)
Exemple #3
0
def plot_hist(dfs,
              branch,
              latex_branch=None,
              unit=None,
              weights=None,
              low=None,
              high=None,
              n_bins=100,
              colors=None,
              alpha=None,
              bar_mode=False,
              density=None,
              orientation='vertical',
              title=None,
              pos_text_LHC=None,
              fig_name=None,
              folder_name=None,
              fontsize_label=default_fontsize['label'],
              save_fig=True,
              ax=None,
              factor_ymax=None,
              show_leg=None,
              loc_leg='best',
              **params):
    """ Plot (and optionally save) the histogram(s) of ``branch`` for the
    data given in ``dfs``

    Parameters
    ----------
    dfs             : dict(str:pandas.Dataframe) or pandas.Dataframe
        Dictionnary {name of the dataframe : pandas dataframe}.
        Anything that is not a dict is wrapped as ``{"": dfs}``.
    branch          : str
        name of the branch in the dataframe
    latex_branch    : str
        Latex name of the branch (for the labels of the plot).
        If ``None``, it is built with ``string._latex_format(branch)``.
    unit            : str
        Unit of the physical quantity
    weights         : numpy.array
        weights passed to :py:func:`plot_hist_alone`
    low             : float
        low value of the distribution
    high            : float
        high value of the distribution
    n_bins          : int
        Desired number of bins of the histogram
    colors          : str or list(str)
        color(s) used for the histogram(s). Defaults to
        ``['r', 'b', 'g', 'k']``. If there are more histograms than
        colors, the colors are reused cyclically.
    alpha           : float or list(float)
        transparancy(ies) of the histograms. A ``None`` entry is replaced
        by 0.5 if there are several histograms, 1 otherwise.
    bar_mode       : bool
        if True, plot with bars, else, plot with points and error bars
    density         : bool
        if True, divide the numbers of counts in the histogram by the total number of counts.
        If ``None``, it is True when there are at least 2 histograms.
    orientation     : 'vertical' or 'horizontal'
        orientation of the histogram
    title           : str
        title of the figure to show at the top of the figure
    pos_text_LHC    : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb` as the ``pos`` argument.
    fig_name       : str
        name of the saved figure
    folder_name     : str
        name of the folder where to save the figure
    fontsize_label  : float
        fontsize of the label
    save_fig        : bool
        specifies if the figure is saved
    factor_ymax     : float
        multiplicative factor of ymax.
        If ``None``, it is ``1 + 0.15 * (number of histograms)``.
    ax            : matplotlib.axes.Axes
        axis where to plot
    show_leg        : bool
        True if the legend needs to be shown.
        If ``None``, it is shown when there are at least 2 histograms.
    loc_leg         : str
        location of the legend
    **params       : dict
        passed to :py:func:`plot_hist_alone`

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot (only if ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (only if ``ax`` is not specified)

    Raises
    ------
    ValueError
        if ``orientation`` is neither ``'vertical'`` nor ``'horizontal'``
    """
    # Fail early with a clear message rather than with an unbound variable
    # once everything has already been drawn
    if orientation not in ('vertical', 'horizontal'):
        raise ValueError(
            f"orientation must be 'vertical' or 'horizontal', "
            f"got {orientation!r}")

    if not isinstance(dfs, dict):
        dfs = {"": dfs}
    data_names = list(dfs.keys())
    n_data = len(dfs)
    several = n_data > 1  # at least 2 histograms

    if density is None:
        density = several

    fig, ax = get_fig_ax(ax, orientation)

    if latex_branch is None:
        latex_branch = string._latex_format(branch)

    ax.set_title(title, fontsize=fontsize_label)

    # Common range and bin width for all the histograms
    low, high = pt._redefine_low_high(low, high,
                                      [df[branch] for df in dfs.values()])
    bin_width = get_bin_width(low, high, n_bins)

    # colors
    if colors is None:
        colors = ['r', 'b', 'g', 'k']
    if not isinstance(colors, list):
        colors = [colors]

    weights = el_to_list(weights, n_data)
    alphas = el_to_list(alpha, n_data)
    default_alpha = 0.5 if several else 1

    for i, (data_name, df) in enumerate(dfs.items()):
        # Resolve the default in a local variable so that a list supplied
        # by the caller is never written to
        alpha_i = default_alpha if alphas[i] is None else alphas[i]
        plot_hist_alone(ax,
                        df[branch],
                        n_bins,
                        low,
                        high,
                        colors[i % len(colors)],  # cycle if too few colors
                        bar_mode,
                        alpha=alpha_i,
                        density=density,
                        label=data_name,
                        weights=weights[i],
                        orientation=orientation,
                        **params)

    # Some plot style stuff
    if factor_ymax is None:
        factor_ymax = 1 + 0.15 * n_data

    if show_leg is None:
        show_leg = several

    set_label_hist(ax,
                   latex_branch,
                   unit,
                   bin_width,
                   density=density,
                   fontsize=fontsize_label,
                   orientation=orientation)

    # The "counts" axis is y for a vertical histogram, x for a horizontal one
    axis_y = 'y' if orientation == 'vertical' else 'x'
    pt.fix_plot(ax,
                factor_ymax=factor_ymax,
                show_leg=show_leg,
                pos_text_LHC=pos_text_LHC,
                loc_leg=loc_leg,
                axis=axis_y)

    return end_plot_function(
        fig,
        save_fig=save_fig,
        fig_name=fig_name,
        folder_name=folder_name,
        default_fig_name=f'{branch}_{string.list_into_string(data_names)}',
        ax=ax)
Exemple #4
0
def compare_train_test(bdt,
                       X_train,
                       y_train,
                       X_test,
                       y_test,
                       bins=30,
                       BDT_name="",
                       colors=['red', 'green']):
    """ Plot and save the overtraining plot of a BDT.

    The BDT output is drawn for signal and background, as filled histograms
    for the training sample and as points with error bars for the test
    sample. The figure is handed to ``pt.save_fig`` with the name
    ``"overtraining"`` and the folder ``BDT/{BDT_name}``. The
    Kolmogorov-Smirnov statistics and p-values comparing train and test
    are printed and returned.

    Parameters
    ----------
    bdt           : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained BDT classifier; only its ``decision_function`` method is used
    X_train : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for training
    y_train : numpy.array
        Array with 1 for the signal events, and 0 for background events (shuffled) for training
    X_test  : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for test
    y_test  : numpy.array
        Array with 1 for the signal events, and 0 for background events (shuffled) for test
    bins          : int
        number of bins of the plotted histograms
    BDT_name      : str
        name of the BDT, used for the folder where the figure is saved
    colors        : [str, str]
        colors used for the signal (``colors[0]``) and the background
        (``colors[1]``)

    Returns
    -------
    fig              : matplotlib.figure.Figure
        Figure of the plot
    ax               : matplotlib.figure.Axes
        Axis of the plot
    ks_2samp_sig     : float
        Kolmogorov-Smirnov statistic for the signal distributions
    ks_2samp_bkg     : float
        Kolmogorov-Smirnov statistic for the background distributions
    pvalue_2samp_sig : float
        p-value of the Kolmogorov-Smirnov test for the signal distributions
    pvalue_2samp_bkg : float
        p-value of the Kolmogorov-Smirnov test for the background distributions
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    # decisions[0]: train, signal      (y > 0.5)
    # decisions[1]: train, background  (y < 0.5)
    # decisions[2]: test, signal
    # decisions[3]: test, background
    decisions = []
    for X, y in ((X_train, y_train), (X_test, y_test)):
        d_sig = bdt.decision_function(X[y > 0.5]).ravel()
        d_bkg = bdt.decision_function(X[y < 0.5]).ravel()
        decisions += [d_sig, d_bkg]

    # Range of the full plot
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low, high)

    # Training sample: normalised step-filled histograms
    train_samples = ((decisions[0], colors[0], 'S (train)'),
                     (decisions[1], colors[1], 'B (train)'))
    for sample, color, label in train_samples:
        ax.hist(sample,
                color=color,
                alpha=0.5,
                range=low_high,
                bins=bins,
                histtype='stepfilled',
                density=True,
                label=label)

    # Test sample: points with uncertainties
    test_samples = ((decisions[2], colors[0], 'S (test)'),
                    (decisions[3], colors[1], 'B (test)'))
    for sample, color, label in test_samples:
        hist, bin_edges = np.histogram(sample,
                                       bins=bins,
                                       range=low_high,
                                       density=True)
        # ``hist * scale`` gives back the raw counts of THIS sample, so that
        # the Poisson error sqrt(counts) can be computed and then rescaled
        # to the density normalisation
        scale = len(sample) / hist.sum()
        err = np.sqrt(hist * scale) / scale

        center = (bin_edges[:-1] + bin_edges[1:]) / 2
        ax.errorbar(center, hist, yerr=err, fmt='o', c=color, label=label)

    ax.set_xlabel("BDT output", fontsize=25.)
    ax.set_ylabel("Arbitrary units", fontsize=25.)
    # The legend is drawn here, hence ``show_leg=False`` in ``fix_plot``
    ax.legend(loc='best', fontsize=20.)
    pt.show_grid(ax)

    pt.fix_plot(ax,
                factor_ymax=1.1,
                show_leg=False,
                fontsize_ticks=20.,
                ymin_to_0=False)

    pt.save_fig(fig, "overtraining", folder_name=f'BDT/{BDT_name}')

    # Train vs test compatibility, computed once per species
    ks_sig = ks_2samp(decisions[0], decisions[2])
    ks_bkg = ks_2samp(decisions[1], decisions[3])
    ks_2samp_sig = ks_sig.statistic
    ks_2samp_bkg = ks_bkg.statistic
    pvalue_2samp_sig = ks_sig.pvalue
    pvalue_2samp_bkg = ks_bkg.pvalue

    print('Kolmogorov-Smirnov statistic')
    print(f"signal    : {ks_2samp_sig}")
    print(f"Background: {ks_2samp_bkg}")

    print('p-value')
    print(f"signal    : {pvalue_2samp_sig}")
    print(f"Background: {pvalue_2samp_bkg}")
    return fig, ax, ks_2samp_sig, ks_2samp_bkg, pvalue_2samp_sig, pvalue_2samp_bkg
Exemple #5
0
def signal_background(data1,
                      data2,
                      column=None,
                      range_column=None,
                      grid=True,
                      xlabelsize=None,
                      ylabelsize=None,
                      sharex=False,
                      sharey=False,
                      figsize=None,
                      layout=None,
                      n_bins=40,
                      fig_name=None,
                      folder_name=None,
                      colors=['red', 'green'],
                      **kwds):
    """Draw, for every numeric column, the normalised histograms of
    ``data1`` and ``data2`` on the same axis (one subplot per column) and
    save the figure.

    ``data1`` is drawn with the label ``'background'`` and ``colors[1]``,
    ``data2`` with the label ``'signal'`` and ``colors[0]``.
    The figure is handed to ``pt.save_fig`` with the name
    ``1D_hist_{fig_name}`` and the folder ``BDT/{folder_name}``.

    Parameters
    ----------
    data1        : pandas.Dataframe
        First dataset
    data2        : pandas.Dataframe
        Second dataset
    column       : str or list(str)
        If passed, will be used to limit data to a subset of columns
    range_column : list([float or None, float or None] or None)
        ``[low, high]`` of the histogram of each plotted column, in the
        order of the plotted columns. A ``None`` entry (or a ``None``
        bound) is forwarded as is to ``pt.redefine_low_high``.
        If ``None``, no range is imposed for any column.
    grid         : bool
        currently unused by this function
    xlabelsize   : int
        currently unused by this function
    ylabelsize   : int
        currently unused by this function
    sharex       : bool
        if ``True``, the X axis will be shared amongst all subplots.
    sharey       : bool
        if ``True``, the Y axis will be shared amongst all subplots.
    figsize      : tuple
        the size of the figure to create in inches by default
    layout       :
        currently unused by this function
    n_bins       : int,
        number of histogram bins to be used
    fig_name    : str
        name of the saved file. If ``None``, it is built from ``column``
        (or from the plotted columns if ``column`` is ``None``).
    folder_name  : str
        name of the folder where to save the plot
    colors       : [str, str]
        colors used for the two datasets
    **kwds       : dict
        accepted for compatibility; currently not forwarded to the
        plotting calls

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    axes : numpy.ndarray of matplotlib.axes.Axes
        2D array of the axes of the subplots
    """
    if column is not None:
        # column is not a list, convert it into a list.
        if not isinstance(column, (list, np.ndarray, Index)):
            column = [column]
        data1 = data1[column]
        data2 = data2[column]

    data1 = data1._get_numeric_data()  # select only numbers
    data2 = data2._get_numeric_data()  # select only numbers
    plotted_columns = list(data1.columns)
    naxes = len(plotted_columns)  # one subplot per available column

    max_nrows = 4
    # NOTE(review): when naxes is a multiple of max_nrows this creates one
    # extra, fully blank column; kept as is because it determines the
    # shape of the returned ``axes`` array.
    fig, axes = plt.subplots(nrows=min(naxes, max_nrows),
                             ncols=1 + naxes // max_nrows,
                             squeeze=False,
                             sharex=sharex,
                             sharey=sharey,
                             figsize=figsize)

    flat_axes = axes.ravel()

    if range_column is None:
        # Sized on the plotted columns, so that it also works when
        # ``column`` is None
        range_column = [None] * naxes

    for i, col in enumerate(plotted_columns):
        # col = name of the column/variable
        ax = flat_axes[i]

        # Read the requested range without writing into the caller's list
        requested = range_column[i]
        if requested is None:
            requested = (None, None)

        low, high = pt.redefine_low_high(requested[0],
                                         requested[1],
                                         [data1[col], data2[col]])

        for data, color, label in ((data1, colors[1], 'background'),
                                   (data2, colors[0], 'signal')):
            h.plot_hist_alone(ax,
                              data[col].dropna().values,
                              n_bins,
                              low,
                              high,
                              color,
                              mode_hist=True,
                              alpha=0.5,
                              density=True,
                              label=label,
                              label_ncounts=True)

        bin_width = (high - low) / n_bins
        latex_branch, unit = RVariable.get_latex_branch_unit_from_branch(col)
        h.set_label_hist(ax,
                         latex_branch,
                         unit,
                         bin_width=bin_width,
                         density=False,
                         fontsize=20)
        pt.fix_plot(ax,
                    factor_ymax=1 + 0.3,
                    show_leg=True,
                    fontsize_ticks=15.,
                    fontsize_leg=20.)
        pt.show_grid(ax, which='major')

    # Hide the subplots of the grid that did not receive any column
    for unused_ax in flat_axes[naxes:]:
        unused_ax.axis('off')

    #fig.subplots_adjust(wspace=0.3, hspace=0.7)
    if fig_name is None:
        names = column if column is not None else plotted_columns
        fig_name = string.list_into_string(names)

    plt.tight_layout()
    pt.save_fig(fig, f"1D_hist_{fig_name}", folder_name=f'BDT/{folder_name}')

    return fig, axes
Exemple #6
0
def plot_xys(ax,
             x,
             ly,
             xlabel,
             labels=None,
             colors=['b', 'g', 'r', 'y'],
             fontsize=default_fontsize['label'],
             markersize=1,
             linewidth=1.,
             linestyle='-',
             factor_ymax=1.,
             marker='.',
             elinewidth=None,
             annotations=None,
             fontsize_annot=default_fontsize['annotation'],
             space_x=-15,
             space_y=5,
             pos_text_LHC=None):
    """ Plot the curve(s) in `ly` as a function of `x`, with `annotations`.

    The nominal values and the standard deviations of `x` and of each
    curve are extracted with ``unumpy.nominal_values`` and
    ``unumpy.std_devs`` and drawn with ``ax.errorbar``.

    Parameters
    ----------
    ax               : matplotlib.axes.Axes
        axis where to plot
    x                : list(float)
        abcissa of the points
    ly               : list(list(float))
        list of the ordinates of the points of the curves
    xlabel           : str
        label of the x-axis
    labels           : list(str) or str
        labels of the curves. With several curves, they are used in the
        legend and the y-axis is labelled ``'value'``. With a single
        curve, the (first) label is used as the label of the y-axis.
        If ``None``, no legend entry / no y-axis label is set from it.
    colors           : list(str)
        colors of each curve (reused cyclically if there are fewer
        colors than curves)
    fontsize         : float
        fontsize of the labels
    markersize       : float
        size of the markers
    linewidth        : float
        linewidth of the plotted curves
    linestyle        : str
        linestyle of the plotted curves
    factor_ymax      : float
        multiplicative factor of ymax
    marker           : str
        marker style
    elinewidth       : float
        width of the error bars
    annotations      : list(str)
        list of the labels of the points - only if there is one curve (i.e., ``len(ly)==1``)
    fontsize_annot   : float
        fontsize of the annotations
    space_x          : float
        space in pixel from the point to the annotation text, projected in the x-axis
    space_y          : float
        space in pixel from the point to the annotation text, projected in the y-axis
    pos_text_LHC    : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb` as the ``pos`` argument.

    Raises
    ------
    ValueError
        if ``annotations`` is given while there is not exactly one curve
    """
    n_curves = len(ly)

    # Validate before drawing anything (and not with ``assert``, which is
    # stripped when Python runs with -O)
    if annotations is not None and n_curves != 1:
        raise ValueError(
            "annotations can only be used with exactly one curve "
            f"(got {n_curves})")

    colors = el_to_list(colors, n_curves)
    use_legend_labels = n_curves > 1 and labels is not None

    # x is shared by all the curves: convert it once
    x = np.array(x)
    x_n = unumpy.nominal_values(x)
    x_err = unumpy.std_devs(x)

    plot_legend = False
    y_n = None

    for i, y in enumerate(ly):
        label = labels[i] if use_legend_labels else None
        y = np.array(y)
        y_n = unumpy.nominal_values(y)
        ax.errorbar(x_n,
                    y_n,
                    xerr=x_err,
                    yerr=unumpy.std_devs(y),
                    linestyle=linestyle,
                    color=colors[i % len(colors)],
                    markersize=markersize,
                    elinewidth=elinewidth,
                    linewidth=linewidth,
                    label=label,
                    marker=marker)

        if label is not None:
            plot_legend = True

    ax.set_xlabel(xlabel, fontsize=fontsize)

    if n_curves == 1:
        if labels is not None:
            # A plain string is the label itself, not a sequence of labels
            ylabel = labels if isinstance(labels, str) else labels[0]
            ax.set_ylabel(ylabel, fontsize=fontsize)
    else:
        ax.set_ylabel('value', fontsize=fontsize)

    # Grid
    pt.show_grid(ax, which='major')
    pt.show_grid(ax, which='minor')

    # Ticks
    pt.fix_plot(ax,
                factor_ymax=factor_ymax,
                show_leg=plot_legend,
                fontsize_leg=25,
                ymin_to_0=False,
                pos_text_LHC=pos_text_LHC)

    if annotations is not None:
        # Exactly one curve here, so y_n holds the values of that curve
        add_value_labels(ax,
                         x_n,
                         y_n,
                         annotations,
                         labelsize=fontsize_annot,
                         space_x=space_x,
                         space_y=space_y)