def plot_roc(X_test, y_test, bdt, BDT_name=None):
    """ Plot the ROC curve of a trained BDT on the test sample and save it.

    The figure is saved through ``pt.save_fig`` under the name ``ROC`` in the
    folder ``BDT/{BDT_name}``.

    Parameters
    ----------
    X_test : numpy.ndarray
        signal and background concatenated, testing sample
    y_test : numpy.array
        signal and background concatenated, testing sample,
        0 if the events is background, 1 if it is signal
    bdt : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained BDT (only its ``decision_function`` method is used)
    BDT_name : str
        name of the BDT, used for the name of the folder where the plot is
        saved. It is formatted as-is into the folder name, so the default
        ``None`` gives the folder ``BDT/None``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    ax : matplotlib.figure.Axes
        Axis of the plot
    """
    # Get the results -----
    # Output of the BDT for every event of the test sample
    decisions = bdt.decision_function(X_test)

    # fpr / tpr: false / true positive rates obtained by scanning a threshold
    # on the BDT output; the thresholds themselves are not needed here.
    fpr, tpr, _ = roc_curve(y_test, decisions)
    roc_auc = auc(fpr, tpr)

    # Plot the results -----
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, lw=1, label=f'ROC (area = {roc_auc:0.2f})')
    # Diagonal = reference line labelled 'Luck'
    ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=25)
    ax.set_ylabel('True Positive Rate', fontsize=25)
    ax.legend(loc="lower right", fontsize=20.)

    pt.show_grid(ax)
    # The legend is already drawn above, hence show_leg=False
    pt.fix_plot(ax, factor_ymax=1.1, show_leg=False,
                fontsize_ticks=20., ymin_to_0=False)

    # Save the results -----
    pt.save_fig(fig, "ROC", folder_name=f'BDT/{BDT_name}')

    return fig, ax
def plot_divide(dfs, branch, latex_branch, unit, low=None, high=None,
                n_bins=100, fig_name=None, folder_name=None,
                save_fig=True, ax=None, pos_text_LHC=None):
    """ Plot the bin-by-bin ratio
    (histogram of ``branch`` in dataframe 1) / (histogram of ``branch`` in dataframe 2),
    each histogram being normalised to its total number of counts.

    Only the first two entries of ``dfs`` are used for the ratio.

    Parameters
    ----------
    dfs : dict(str:pandas.Dataframe)
        Dictionnary {name of the dataframe : pandas dataframe}.
        Must contain at least two entries (numerator first, denominator second).
    branch : str
        name of the branch in the dataframe
    latex_branch : str
        Latex name of the branch (for the labels of the plot)
    unit : str
        Unit of the physical quantity
    low : float
        low value of the distribution
    high : float
        high value of the distribution
    n_bins : int
        Desired number of bins of the histogram
    fig_name : str
        name of the saved figure
    folder_name : str
        name of the folder where to save the figure
    save_fig : bool
        specifies if the figure is saved
    ax : matplotlib.axes.Axes
        axis where to plot
    pos_text_LHC : dict, list or str
        passed to ``pt.fix_plot`` as ``pos_text_LHC``

    Returns
    -------
    Whatever ``end_plot_function`` returns
    (NOTE(review): presumably ``fig, ax`` only when ``ax`` was not
    supplied by the caller - confirm against ``end_plot_function``).
    """
    fig, ax = get_fig_ax(ax)

    data_names = list(dfs.keys())
    list_dfs = list(dfs.values())

    # Common range and bin width of the two histograms
    low, high = pt._redefine_low_high(
        low, high, [df[branch] for df in list_dfs])
    bin_width = get_bin_width(low, high, n_bins)

    # Histograms of the two samples on identical bins
    counts1, bin_edges = np.histogram(list_dfs[0][branch], n_bins,
                                      range=(low, high))
    counts2, _ = np.histogram(list_dfs[1][branch], n_bins,
                              range=(low, high))
    bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2.

    # sqrt(N) uncertainty on the counts of each bin
    err1 = np.sqrt(counts1)
    err2 = np.sqrt(counts2)

    # Ratio of the normalised histograms. Empty bins produce inf/nan values;
    # the corresponding numpy warnings are silenced on purpose.
    with np.errstate(divide='ignore', invalid='ignore'):
        division = counts1 * counts2.sum() / (counts2 * counts1.sum())
        # Relative uncertainties added in quadrature
        err = division * np.sqrt((err1 / counts1)**2 + (err2 / counts2)**2)

    ax.errorbar(bin_centres, division, yerr=err, fmt='o', color='k')
    # Reference line at ratio = 1
    ax.plot([low, high], [1., 1.], linestyle='--', color='b', marker='')

    # Labels
    set_label_divided_hist(ax, latex_branch, unit, bin_width, data_names,
                           fontsize=25)

    # Set lower and upper range of the x and y axes.
    # ``ymin_to_0`` is the spelling used by the other ``pt.fix_plot`` call
    # sites of this file; the previous ``ymin_to0`` looked like a typo.
    # NOTE(review): confirm against the signature of ``pt.fix_plot``.
    pt.fix_plot(ax, factor_ymax=1.1, show_leg=False,
                fontsize_ticks=20., ymin_to_0=False,
                pos_text_LHC=pos_text_LHC)

    # Save
    default_fig_name = (
        f"{branch.replace('/','d')}_"
        f"{string.list_into_string(data_names,'_d_')}"
    )
    return end_plot_function(
        fig, save_fig=save_fig, fig_name=fig_name, folder_name=folder_name,
        default_fig_name=default_fig_name,
        ax=ax)
def plot_hist(dfs, branch, latex_branch=None, unit=None, weights=None,
              low=None, high=None, n_bins=100, colors=None, alpha=None,
              bar_mode=False, density=None, orientation='vertical',
              title=None, pos_text_LHC=None,
              fig_name=None, folder_name=None,
              fontsize_label=default_fontsize['label'],
              save_fig=True, ax=None,
              factor_ymax=None,
              show_leg=None, loc_leg='best',
              **params):
    """ Plot (and optionally save) the histogram(s) of ``branch`` for the
    data given in ``dfs``.

    Parameters
    ----------
    dfs : dict(str:pandas.Dataframe) or pandas.Dataframe
        Dictionnary {name of the dataframe : pandas dataframe}.
        Anything that is not a dict is wrapped as ``{"": dfs}``.
    branch : str
        name of the branch in the dataframe
    latex_branch : str
        Latex name of the branch (for the labels of the plot).
        If ``None``, it is derived from ``branch`` with
        ``string._latex_format``.
    unit : str
        Unit of the physical quantity
    weights : numpy.array or list(numpy.array)
        weights, one entry per dataframe after expansion by ``el_to_list``
    low : float
        low value of the distribution
    high : float
        high value of the distribution
    n_bins : int
        Desired number of bins of the histogram
    colors : str or list(str)
        color(s) used for the histogram(s). Defaults to
        ``['r', 'b', 'g', 'k']``. If there are more histograms than colors,
        the colors are reused cyclically.
    alpha : float or list(float)
        transparancy(ies) of the histograms. A ``None`` entry is replaced by
        0.5 if several histograms are plotted, by 1 otherwise.
    bar_mode : bool
        passed to ``plot_hist_alone``
        (if True, plot with bars, else, plot with points and error bars)
    density : bool
        if True, divide the numbers of counts in the histogram by the total
        number of counts. If ``None``, it is True when more than one
        histogram is plotted.
    orientation : 'vertical' or 'horizontal'
        orientation of the histogram
    title : str
        title of the figure to show at the top of the figure
    pos_text_LHC : dict, list or str
        passed to ``pt.fix_plot`` as ``pos_text_LHC``
    fig_name : str
        name of the saved figure
    folder_name : str
        name of the folder where to save the figure
    fontsize_label : float
        fontsize of the label
    save_fig : bool
        specifies if the figure is saved
    ax : matplotlib.axes.Axes
        axis where to plot
    factor_ymax : float
        multiplicative factor of ymax.
        Defaults to ``1 + 0.15 * (number of histograms)``.
    show_leg : bool
        True if the legend needs to be shown.
        Defaults to True when more than one histogram is plotted.
    loc_leg : str
        location of the legend
    **params : dict
        passed to ``plot_hist_alone``

    Returns
    -------
    Whatever ``end_plot_function`` returns
    (NOTE(review): presumably ``fig, ax`` only when ``ax`` was not
    supplied by the caller - confirm against ``end_plot_function``).

    Raises
    ------
    ValueError
        if ``orientation`` is neither ``'vertical'`` nor ``'horizontal'``
    """
    # Axis that carries the counts, depending on the orientation.
    # Checked up-front so that a wrong value gives a clear error instead of
    # an unbound local variable after everything has been drawn.
    count_axis = {'vertical': 'y', 'horizontal': 'x'}
    if orientation not in count_axis:
        raise ValueError(
            "orientation must be 'vertical' or 'horizontal', "
            f"got {orientation!r}")
    axis_y = count_axis[orientation]

    if not isinstance(dfs, dict):
        dfs = {"": dfs}
    n_data = len(dfs)
    data_names = list(dfs.keys())

    if density is None:
        # Normalise as soon as there is more than one histogram
        density = n_data > 1

    fig, ax = get_fig_ax(ax, orientation)

    if latex_branch is None:
        latex_branch = string._latex_format(branch)

    ax.set_title(title, fontsize=fontsize_label)

    # Common range and bin width of all the histograms
    low, high = pt._redefine_low_high(
        low, high, [df[branch] for df in dfs.values()])
    bin_width = get_bin_width(low, high, n_bins)

    # colors
    if colors is None:
        colors = ['r', 'b', 'g', 'k']
    if not isinstance(colors, list):
        colors = [colors]

    weights = el_to_list(weights, n_data)
    alpha = el_to_list(alpha, n_data)

    for i, (data_name, df) in enumerate(dfs.items()):
        # Resolve the default transparency in a local variable so that a
        # list supplied by the caller is never modified.
        alpha_i = alpha[i]
        if alpha_i is None:
            alpha_i = 0.5 if n_data > 1 else 1

        plot_hist_alone(ax, df[branch], n_bins, low, high,
                        # cycle through the colors if there are fewer
                        # colors than histograms
                        colors[i % len(colors)],
                        bar_mode, alpha=alpha_i,
                        density=density, label=data_name,
                        weights=weights[i],
                        orientation=orientation,
                        **params)

    # Some plot style stuff
    if factor_ymax is None:
        factor_ymax = 1 + 0.15 * n_data

    if show_leg is None:
        show_leg = n_data > 1

    set_label_hist(ax, latex_branch, unit, bin_width,
                   density=density, fontsize=fontsize_label,
                   orientation=orientation)

    pt.fix_plot(ax, factor_ymax=factor_ymax, show_leg=show_leg,
                pos_text_LHC=pos_text_LHC, loc_leg=loc_leg, axis=axis_y)

    return end_plot_function(
        fig, save_fig=save_fig, fig_name=fig_name, folder_name=folder_name,
        default_fig_name=f'{branch}_{string.list_into_string(data_names)}',
        ax=ax)
def compare_train_test(bdt, X_train, y_train, X_test, y_test, bins=30,
                       BDT_name="", colors=['red', 'green']):
    """ Plot and save the overtraining plot.

    The BDT output is histogrammed separately for signal and background, for
    the training sample (filled histograms) and for the test sample (points
    with error bars). The figure is saved through ``pt.save_fig`` under the
    name ``overtraining`` in the folder ``BDT/{BDT_name}``. The
    Kolmogorov-Smirnov comparison of train vs. test is printed and returned.

    Parameters
    ----------
    bdt : sklearn.ensemble.AdaBoostClassifier or sklearn.ensemble.GradientBoostingClassifier
        trained BDT classifier (only ``decision_function`` is used)
    X_train : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for training
    y_train : numpy.array
        Array with 1 for the signal events, and 0 for background events
        (shuffled) for training
    X_test : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for test
    y_test : numpy.array
        Array with 1 for the signal events, and 0 for background events
        (shuffled) for test
    bins : int
        number of bins of the plotted histograms
    BDT_name : str
        name of the BDT, used for the folder where the figure is saved
    colors : [str, str]
        colors used for the signal (``colors[0]``) and the background
        (``colors[1]``) distributions. The list is only read.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    ax : matplotlib.figure.Axes
        Axis of the plot
    ks_2samp_sig : float
        Kolmogorov-Smirnov statistic for the signal distributions
    ks_2samp_bkg : float
        Kolmogorov-Smirnov statistic for the background distributions
    pvalue_2samp_sig : float
        p-value of the Kolmogorov-Smirnov test for the signal distributions
    pvalue_2samp_bkg : float
        p-value of the Kolmogorov-Smirnov test for the background
        distributions
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    # decisions[0]: train, signal       (y > 0.5)
    # decisions[1]: train, background   (y < 0.5)
    # decisions[2]: test,  signal
    # decisions[3]: test,  background
    decisions = []
    for X, y in ((X_train, y_train), (X_test, y_test)):
        d_sig = bdt.decision_function(X[y > 0.5]).ravel()
        d_bkg = bdt.decision_function(X[y < 0.5]).ravel()
        decisions += [d_sig, d_bkg]

    # Range of the full plot, shared by the four histograms
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low, high)

    # Training sample: step-filled, normalised histograms
    ax.hist(decisions[0],
            color=colors[0], alpha=0.5, range=low_high, bins=bins,
            histtype='stepfilled', density=True,
            label='S (train)')
    ax.hist(decisions[1],
            color=colors[1], alpha=0.5, range=low_high, bins=bins,
            histtype='stepfilled', density=True,
            label='B (train)')

    # Test sample: points with uncertainties, on the same binning
    test_samples = (
        (decisions[2], colors[0], 'S (test)'),
        (decisions[3], colors[1], 'B (test)'),
    )
    for test_decisions, color, label in test_samples:
        hist, bin_edges = np.histogram(test_decisions, bins=bins,
                                       range=low_high, density=True)
        # ``hist * scale`` recovers the raw counts of THIS sample, so that
        # the Poisson error sqrt(counts) can be scaled back to density
        # units. The sample's own size must be used here.
        scale = len(test_decisions) / hist.sum()
        err = np.sqrt(hist * scale) / scale

        center = (bin_edges[:-1] + bin_edges[1:]) / 2
        ax.errorbar(center, hist, yerr=err, fmt='o', c=color, label=label)

    ax.set_xlabel("BDT output", fontsize=25.)
    ax.set_ylabel("Arbitrary units", fontsize=25.)
    ax.legend(loc='best', fontsize=20.)
    pt.show_grid(ax)

    # The legend is already drawn above, hence show_leg=False
    pt.fix_plot(ax, factor_ymax=1.1, show_leg=False,
                fontsize_ticks=20., ymin_to_0=False)

    pt.save_fig(fig, "overtraining", folder_name=f'BDT/{BDT_name}')

    # Train vs. test compatibility, one test per category
    ks_sig = ks_2samp(decisions[0], decisions[2])
    ks_bkg = ks_2samp(decisions[1], decisions[3])
    ks_2samp_sig = ks_sig.statistic
    ks_2samp_bkg = ks_bkg.statistic
    pvalue_2samp_sig = ks_sig.pvalue
    pvalue_2samp_bkg = ks_bkg.pvalue

    print('Kolmogorov-Smirnov statistic')
    print(f"signal    : {ks_2samp_sig}")
    print(f"Background: {ks_2samp_bkg}")

    print('p-value')
    print(f"signal    : {pvalue_2samp_sig}")
    print(f"Background: {pvalue_2samp_bkg}")

    return (fig, ax, ks_2samp_sig, ks_2samp_bkg,
            pvalue_2samp_sig, pvalue_2samp_bkg)
def signal_background(data1, data2, column=None, range_column=None,
                      grid=True, xlabelsize=None, ylabelsize=None,
                      sharex=False, sharey=False, figsize=None,
                      layout=None, n_bins=40, fig_name=None,
                      folder_name=None, colors=['red', 'green'], **kwds):
    """Draw, for each numeric column, the normalised histograms of ``data1``
    and ``data2`` on a common axis, and save the figure through
    ``pt.save_fig`` under the name ``1D_hist_{fig_name}`` in the folder
    ``BDT/{folder_name}``.

    ``data1`` is drawn with the label ``'background'`` and ``colors[1]``;
    ``data2`` is drawn with the label ``'signal'`` and ``colors[0]``.

    Parameters
    ----------
    data1 : pandas.Dataframe
        First dataset (labelled 'background')
    data2 : pandas.Dataframe
        Second dataset (labelled 'signal')
    column : str or list(str)
        If passed, will be used to limit data to a subset of columns.
        If ``None``, all the numeric columns of ``data1`` are plotted.
    range_column : list([float or None, float or None]) or None
        ``[low, high]`` of each plotted column, in the order of the plotted
        columns. A ``None`` entry, or a ``None`` bound, is forwarded as
        ``None`` to ``pt.redefine_low_high``. The list is not modified.
    grid : bool
        currently unused
    xlabelsize : int
        currently unused
    ylabelsize : int
        currently unused
    sharex : bool
        if ``True``, the X axis will be shared amongst all subplots.
    sharey : bool
        if ``True``, the Y axis will be shared amongst all subplots.
    figsize : tuple
        the size of the figure to create in inches by default
    layout : object
        currently unused
    n_bins : int
        number of histogram bins to be used
    fig_name : str
        name of the saved file. If ``None``, it is built from ``column``
        (or from the plotted columns if ``column`` is ``None``) with
        ``string.list_into_string``.
    folder_name : str
        name of the folder where to save the plot
    colors : [str, str]
        colors used for the two datasets (see above). Only read.
    **kwds : dict
        only ``alpha`` (default 0.5) is used, as the transparency of the
        histograms; other keyword arguments are ignored.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    axes : numpy.ndarray of matplotlib.axes.Axes
        2D array of the axes of the figure
    """
    alpha = kwds.get('alpha', 0.5)

    if column is not None:
        # column is not a list, convert it into a list.
        if not isinstance(column, (list, np.ndarray, Index)):
            column = [column]
        data1 = data1[column]
        data2 = data2[column]

    data1 = data1._get_numeric_data()  # select only numbers
    data2 = data2._get_numeric_data()  # select only numbers

    plotted_columns = list(data1.columns)
    naxes = len(plotted_columns)  # one axis per available column

    # At most ``max_nrows`` rows; extra columns of subplots are added when
    # needed. NOTE(review): when ``naxes`` is a multiple of ``max_nrows``
    # this allocates one extra (blank) column of subplots; kept as-is so
    # that the shape of the returned ``axes`` array does not change.
    max_nrows = 4
    fig, axes = plt.subplots(nrows=min(naxes, max_nrows),
                             ncols=1 + naxes // max_nrows,
                             squeeze=False,
                             sharex=sharex, sharey=sharey,
                             figsize=figsize)
    flat_axes = axes.ravel()

    # Per-column [low, high]; built as a new list so that the argument of
    # the caller is left untouched, and sized on the plotted columns so that
    # ``column=None`` works.
    if range_column is None:
        ranges = [[None, None] for _ in range(naxes)]
    else:
        ranges = [[None, None] if rng is None else rng
                  for rng in range_column]

    for i, col in enumerate(plotted_columns):
        ax = flat_axes[i]

        # ``None`` bounds are resolved from the two datasets
        low, high = pt.redefine_low_high(
            ranges[i][0], ranges[i][1], [data1[col], data2[col]])

        h.plot_hist_alone(
            ax, data1[col].dropna().values, n_bins, low, high,
            colors[1], mode_hist=True, alpha=alpha,
            density=True, label='background', label_ncounts=True)
        h.plot_hist_alone(
            ax, data2[col].dropna().values, n_bins, low, high,
            colors[0], mode_hist=True, alpha=alpha,
            density=True, label='signal', label_ncounts=True)

        bin_width = (high - low) / n_bins
        latex_branch, unit = RVariable.get_latex_branch_unit_from_branch(col)
        h.set_label_hist(ax, latex_branch, unit,
                         bin_width=bin_width, density=False, fontsize=20)

        pt.fix_plot(ax, factor_ymax=1 + 0.3, show_leg=True,
                    fontsize_ticks=15., fontsize_leg=20.)
        pt.show_grid(ax, which='major')

    # Hide the subplots that were allocated but not used
    for unused_ax in flat_axes[naxes:]:
        unused_ax.axis('off')

    if fig_name is None:
        fig_name = string.list_into_string(
            column if column is not None else plotted_columns)

    plt.tight_layout()
    pt.save_fig(fig, f"1D_hist_{fig_name}", folder_name=f'BDT/{folder_name}')

    return fig, axes
def plot_xys(ax, x, ly, xlabel, labels=None,
             colors=['b', 'g', 'r', 'y'],
             fontsize=default_fontsize['label'],
             markersize=1,
             linewidth=1., linestyle='-', factor_ymax=1.,
             marker='.', elinewidth=None,
             annotations=None,
             fontsize_annot=default_fontsize['annotation'],
             space_x=-15, space_y=5,
             pos_text_LHC=None):
    """ Plot the curve(s) in `ly` as a function of `x`, with `annotations`.

    Values may carry uncertainties: nominal values and standard deviations
    are extracted with ``unumpy.nominal_values`` and ``unumpy.std_devs`` and
    drawn as points with error bars.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        axis where to plot
    x : list(float)
        abcissa of the points
    ly : list(list(float))
        list of the ordinates of the points of the curves
    xlabel : str
        label of the x-axis
    labels : list(str) or str
        labels of the curves. With several curves they are used as legend
        entries; with a single curve, the label is used as the y-axis label
        (a bare string is accepted in that case). If ``None``, no legend is
        drawn and the y-axis label is ``'value'``.
    colors : str or list(str)
        colors of each curve (expanded with ``el_to_list``)
    fontsize : float
        fontsize of the labels
    markersize : float
        size of the markers
    linewidth : float
        linewidth of the plotted curves
    linestyle : str
        linestyle of the plotted curves
    factor_ymax : float
        multiplicative factor of ymax
    marker : str
        marker style
    elinewidth : float
        width of the error bars
    annotations : list(str)
        list of the labels of the points - only if there is one curve
        (i.e., ``len(ly)==1``)
    fontsize_annot : float
        fontsize of the annotations
    space_x : float
        space in pixel from the point to the annotation text,
        projected in the x-axis
    space_y : float
        space in pixel from the point to the annotation text,
        projected in the y-axis
    pos_text_LHC : dict, list or str
        passed to ``pt.fix_plot`` as ``pos_text_LHC``

    Raises
    ------
    ValueError
        if ``annotations`` is given while there is not exactly one curve
    """
    n_curves = len(ly)

    # Checked before anything is drawn, and with a real exception so that
    # the check is not stripped when Python runs with -O.
    if annotations is not None and n_curves != 1:
        raise ValueError(
            "annotations can only be used with exactly one curve, "
            f"got {n_curves}")

    # A single curve may be labelled with a bare string
    if isinstance(labels, str) and n_curves == 1:
        labels = [labels]

    colors = el_to_list(colors, n_curves)

    # The abscissa is common to all the curves
    x = np.array(x)
    x_n = unumpy.nominal_values(x)
    x_err = unumpy.std_devs(x)

    plot_legend = False
    for i, y in enumerate(ly):
        # Legend entries only make sense with several labelled curves
        if labels is not None and n_curves > 1:
            label = labels[i]
        else:
            label = None

        y = np.array(y)
        y_n = unumpy.nominal_values(y)

        ax.errorbar(x_n, y_n,
                    xerr=x_err, yerr=unumpy.std_devs(y),
                    linestyle=linestyle, color=colors[i],
                    markersize=markersize, elinewidth=elinewidth,
                    linewidth=linewidth, label=label, marker=marker)

        if label is not None:
            plot_legend = True

    ax.set_xlabel(xlabel, fontsize=fontsize)

    # With one labelled curve, its label is the y-axis label
    if n_curves == 1 and labels is not None:
        ax.set_ylabel(labels[0], fontsize=fontsize)
    else:
        ax.set_ylabel('value', fontsize=fontsize)

    # Grid
    pt.show_grid(ax, which='major')
    pt.show_grid(ax, which='minor')

    # Ticks
    pt.fix_plot(ax, factor_ymax=factor_ymax, show_leg=plot_legend,
                fontsize_leg=25, ymin_to_0=False, pos_text_LHC=pos_text_LHC)

    if annotations is not None:
        # Exactly one curve here, so ``y_n`` holds its nominal values
        add_value_labels(ax, x_n, y_n, annotations,
                         labelsize=fontsize_annot,
                         space_x=space_x, space_y=space_y)