def plot_hist_fit_auto(df, branch, cut_BDT=None, **kwargs):
    """ Plot a fitted histogram, filling in labels and names automatically.

    Retrieve the latex name and the unit of ``branch``, build the figure
    name and the title from the BDT cut, default the folder name to the
    name of the data, then delegate to :py:func:`plot_hist_fit`.

    Parameters
    ----------
    df : pandas.Dataframe
        dataframe that contains the branch to plot
    branch : str
        branch (for instance: ``'B0_M'``), in dataframe
    cut_BDT : float or str
        ``BDT > cut_BDT`` cut. Used in the title and in the name
        of the saved figure.
    **kwargs : dict
        arguments passed to :py:func:`plot_hist_fit`
        (except ``branch``, ``latex_branch``, ``unit``).
        The entries ``fig_name``, ``title``, ``data_name`` and
        ``folder_name`` are read and possibly overwritten here.

    Returns
    -------
    The return value of :py:func:`plot_hist_fit`, unchanged
    (presumably the figure and the axes of the plot).
    """
    latex_branch, unit = pt.get_latex_branches_units(branch)

    # Make sure the optional entries can be read from kwargs below
    # (add_in_dic presumably inserts a default for a missing key -
    # TODO confirm).
    for key in ('fig_name', 'title', 'data_name'):
        add_in_dic(key, kwargs)

    # Name of the file and title, taking the BDT cut into account
    fit_data_name = string.add_text(kwargs['data_name'], 'fit', '_', None)
    kwargs['fig_name'] = pt._get_fig_name_given_BDT_cut(
        fig_name=kwargs['fig_name'],
        cut_BDT=cut_BDT,
        branch=branch,
        data_name=fit_data_name)
    kwargs['title'] = pt._get_title_given_BDT_cut(
        title=kwargs['title'],
        cut_BDT=cut_BDT)

    # Name of the folder = name of the data, unless given explicitly
    add_in_dic('folder_name', kwargs)
    folder_missing = kwargs['folder_name'] is None
    if folder_missing and kwargs['data_name'] is not None:
        kwargs['folder_name'] = kwargs['data_name']

    return plot_hist_fit(df, branch,
                         latex_branch=latex_branch,
                         unit=unit,
                         **kwargs)
def _get_title_given_BDT_cut(title, cut_BDT): """ Return the the new title given the cut on the BDT Parameters ---------- title : str initial title cut_BDT : float cut on the BDT (we keep ``BDT > {cut_BDT}``) Returns ------- title: str new title Examples -------- >>> _get_title_given_BDT_cut("title", -0.1) "title - BDT > -0.1" """ # Title with BDT if cut_BDT is not None: title = string.add_text(title, f"BDT $>$ {cut_BDT}", ' - ') return title
def get_raw_branch(particle, raw_variable):
    """ Build the name(s) of the raw branch(es) from the particle(s)
    and the raw variable(s).

    Parameters
    ----------
    particle: str or tuple(str) or None
        particle or list of particles
    raw_variable: str or list/tuple of str
        name of the raw variable, or names of the raw variables

    Returns
    -------
    raw_branch: str or tuple(str)
        * if ``raw_variable`` is a list or tuple: tuple of the raw
          branches, one per raw variable, each obtained by calling
          this function on the matching ``(particle, raw_variable)``
          pair
        * otherwise: ``add_text(particle, raw_variable)``
          (presumably ``{particle}_{raw_variable}``, or just
          ``{raw_variable}`` when there is no particle - TODO confirm)
    """
    # Single raw variable: combine it directly with the particle
    if not assertion.is_list_tuple(raw_variable):
        return add_text(particle, raw_variable)

    # Several raw variables: pair each of them with a particle
    particles = tuple(el_to_list(particle, len(raw_variable)))
    raw_branches = []
    for sub_particle, sub_raw_variable in zip(particles, raw_variable):
        raw_branches.append(
            RVariable.get_raw_branch(sub_particle, sub_raw_variable))

    return tuple(raw_branches)
def apply_BDT(df_tot, df_train, bdt, BDT_name=None,
              save_BDT=False, kind_data='common'):
    """ Apply a trained BDT and save its output.

    * Evaluate ``bdt.decision_function`` on ``df_train``, which contains
      only the training variables.
    * Store the result in ``df_tot`` as a new column ``'BDT'``
      (``df_tot`` is modified in place).
    * Save ``df_tot`` with ``save_root`` under the name ``file_name``,
      where ``file_name = string.add_text(kind_data, BDT_name, '_')``
      (third argument ``'DecayTree'``, presumably the name of the tree).
    * In addition, save the BDT output alone with ``save_root`` under
      the name ``'BDT_' + file_name`` (also with ``'DecayTree'``).
    * if ``save_BDT`` is ``True``, dump the BDT with ``dump_pickle``
      under the name ``string.add_text('bdt', file_name, '_')``.

    Parameters
    ----------
    df_tot : pandas.Dataframe
        dataframe that will be saved together with the BDT output
    df_train : pandas.Dataframe
        dataframe with only the variables that
        have been used for the training
    bdt : sklearn.ensemble.AdaBoostClassifier
    or sklearn.ensemble.GradientBoostingClassifier
        trained BDT classifier
    BDT_name : str
        name of the BDT, used for the name of the saved files
    save_BDT : bool
        if ``True``, save the BDT in a pickle file
    kind_data : str
        name of the data where the BDT is applied to
        (e.g., ``'MC'``, ``'common'``, ...)
    """
    # The columns of df_train must be the training variables,
    # in the same order as during the training
    bdt_output = bdt.decision_function(df_train)
    df_tot['BDT'] = bdt_output

    file_name = string.add_text(kind_data, BDT_name, '_')

    # BDT output alone, then the full dataframe
    df_bdt_only = pd.DataFrame({'BDT': df_tot['BDT']})
    save_root(df_bdt_only, 'BDT_' + file_name, 'DecayTree')
    save_root(df_tot, file_name, 'DecayTree')

    if not save_BDT:
        return
    dump_pickle(bdt, string.add_text('bdt', file_name, '_'))
def _get_fig_name_given_BDT_cut(fig_name=None, cut_BDT=None, branch="", data_name=None): """ Return the new name of the file and the new title given the cut on the BDT Parameters ---------- fig_name : str initial name of the file cut_BDT : float cut on the BDT (we keep ``BDT > {cut_BDT}``) branch : float a name of branch (e.g., ``'B0_M'``) data_name : str or None name of the plotted data Returns ------- fig_name: str new fig_name Examples -------- >>> _get_title_given_BDT_cut("fig_name", -0.1, 'B0_M', 'MC') "fig_BDT_name-0.1" >>> _get_title_given_BDT_cut(None, -0.1, 'B0_M', 'MC') "B0_M_MC_BDT-0.1" """ assert (fig_name is not None) or (data_name is not None) if fig_name is None: fig_name = string.add_text(branch, data_name, '_') # Title with BDT if cut_BDT is not None: fig_name = string.add_text(fig_name, f'BDT{cut_BDT}') return fig_name
def quantity(self):
    """ Combined physical quantity, with the function applied to it.

    Only defined if there is zero or one particle: ``None`` is
    returned when ``self.particle`` is a list or tuple of more than
    one element.

    Otherwise, the raw quantities are joined with ``','`` and combined
    with ``self.name_function`` through ``add_text`` with the
    separator ``':'``; presumably this gives:

    * ``{raw_quantity}`` if there is one raw quantity and no function
    * ``{raw_quantity}:{name_function}`` if there is one raw quantity
      and a function
    * ``{raw_quantity[0]},{raw_quantity[1]}:{name_function}``
      if there are two raw quantities and a function
    """
    # Not defined for several particles
    if assertion.is_list_tuple(self.particle) and len(self.particle) > 1:
        return None

    # A single raw quantity is handled as a list of one element
    raw_quantities = (
        [self.raw_quantity]
        if isinstance(self.raw_quantity, str)
        else self.raw_quantity
    )
    joined_quantities = ','.join(raw_quantities)

    return add_text(joined_quantities, self.name_function, sep=':')
def classification_report_print(X_test, y_test, bdt, BDT_name=None):
    """ Test the bdt training with the testing sample.

    Print the classification report and the area under the ROC curve,
    and write both in the text file
    ``{path}/{fig_name}.txt``, where ``path`` is returned by
    ``create_directory(f"{loc['tables']}/BDT/", BDT_name)``
    and ``fig_name`` is
    ``string.add_text('classification_report', BDT_name, '_')``.

    Parameters
    ----------
    X_test : numpy.ndarray
        Array with signal and MC data concatenated and shuffled for test
    y_test : numpy.array
        Array with 1 for the signal events, and 0 for background events
        (shuffled) for test
    bdt : sklearn.ensemble.AdaBoostClassifier
    or sklearn.ensemble.GradientBoostingClassifier
        trained classifier
    BDT_name : str
        name of the BDT, used for the path of the saved txt file.
    """
    # Classification report from the predicted classes
    predicted_classes = bdt.predict(X_test)
    report = classification_report(
        y_test,
        predicted_classes,
        target_names=["background", "signal"])
    print(report)

    # The ROC AUC is computed from the scores of the decision function
    # rather than from the predicted classes
    scores = bdt.decision_function(X_test)
    auc = roc_auc_score(y_test, scores)
    auc_line = "Area under ROC curve: %.4f" % (auc)
    print(auc_line)

    # Write the results -----
    file_stem = string.add_text('classification_report', BDT_name, '_')
    directory = create_directory(f"{loc['tables']}/BDT/", BDT_name)
    with open(f"{directory}/{file_stem}.txt", 'w') as f:
        f.write(report)
        f.write(auc_line)
def plot_scatter2d(dfs, branches, latex_branches, units=[None, None],
                   low=None, high=None, n_bins=100,
                   colors=['g', 'r', 'orange', 'b'],
                   data_name=None,
                   title=None,
                   fig_name=None, folder_name=None,
                   fontsize_label=default_fontsize['label'],
                   save_fig=True, ax=None, get_sc=False,
                   pos_text_LHC=None, **params):
    """ Plot a 2D scatter plot of 2 branches, for one or several datasets.

    Parameters
    ----------
    dfs : dict(str: pandas.Dataframe)
        Datasets to plot, keyed by their name. The name of each
        dataset is used as the label of its scatter plot.
    branches : [str, str]
        names of the two branches
    latex_branches : [str, str]
        latex names of the two branches
    units : str or [str, str]
        Common unit or list of two units of the two branches
    low : float or [float, float]
        low value(s) of the branches
    high : float or [float, float]
        high value(s) of the branches
    n_bins : int or [int, int]
        not used by this function
    colors : list(str)
        colors of the scatter plots; ``colors[k]`` is used
        for the ``k``-th dataset of ``dfs``
    data_name : str or list(str)
        name of the data, used to define the name of the figure
        in the case ``fig_name`` is not defined. If ``None``,
        the names of all the datasets in ``dfs`` are used.
    title : str
        title of the figure
    fig_name : str
        name of the saved figure
    folder_name : str
        name of the folder where to save the figure
    fontsize_label : float
        fontsize of the label of the axes
    save_fig : bool
        specifies if the figure is saved. Forced to ``False``
        when ``ax`` is given.
    ax : matplotlib.axes.Axes
        axis where to plot
    get_sc : bool
        if True: get the scatter plot
    pos_text_LHC : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb`
        as the ``pos`` argument.
    **params : dict
        passed to ``ax.scatter``

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot (only if ``get_fig_ax`` returns a figure,
        presumably when ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (same condition as ``fig``)
    scs : matplotlib.PathCollection or list(matplotlib.PathCollection)
        scatter plot or list of scatter plots
        (only if ``get_sc`` is ``True``)
    """
    # NOTE: the list defaults of ``units`` and ``colors`` are only read,
    # never mutated, so sharing them between calls is harmless.

    # low, high and units into a list of size 2
    low = el_to_list(low, 2)
    high = el_to_list(high, 2)
    units = el_to_list(units, 2)

    # A figure that belongs to the caller is not saved here
    if ax is not None:
        save_fig = False

    fig, ax = get_fig_ax(ax)

    title = string.add_text(None, title, default=None)
    ax.set_title(title, fontsize=25)

    # The loop variable must not be called ``data_name``: it would
    # overwrite the argument and only the last dataset name would
    # be left for the name of the figure.
    dataset_names = list(dfs.keys())
    scs = [
        ax.scatter(df[branches[0]], df[branches[1]],
                   c=colors[k], label=dataset_name, **params)
        for k, (dataset_name, df) in enumerate(dfs.items())
    ]

    if len(scs) == 1:
        scs = scs[0]

    ax.set_xlim([low[0], high[0]])
    ax.set_ylim([low[1], high[1]])

    # Label, ticks, LHCb text
    pt.set_label_ticks(ax)
    pt.set_text_LHCb(ax, pos=pos_text_LHC)

    set_label_2Dhist(ax, latex_branches, units, fontsize=fontsize_label)

    # Save the figure
    if save_fig:
        # Default: name the figure after all the plotted datasets
        if data_name is None and dataset_names:
            data_name = dataset_names
        default_fig_name = string.add_text(
            string.list_into_string(branches, '_vs_'),
            string.list_into_string(data_name, '_'),
            '_')
        pt.save_fig(fig, fig_name, folder_name, default_fig_name)

    if fig is not None:
        if get_sc:
            return fig, ax, scs
        return fig, ax

    if get_sc:
        return scs
    return None
def plot_hist2d(df, branches, latex_branches, units,
                low=None, high=None, n_bins=100,
                log_scale=False, title=None,
                fig_name=None, folder_name=None,
                data_name=None,
                save_fig=True, ax=None,
                pos_text_LHC=None):
    """ Plot a 2D histogram of 2 branches, with its color bar.

    Parameters
    ----------
    df : pandas.Dataframe
        Dataframe that contains the 2 branches to plot
    branches : [str, str]
        names of the two branches
    latex_branches : [str, str]
        latex names of the two branches
    units : str or [str, str]
        Common unit or list of two units of the two branches
    low : float or [float, float]
        low value(s) of the branches
    high : float or [float, float]
        high value(s) of the branches
    n_bins : int or [int, int]
        number of bins
    log_scale : bool
        if true, the colorbar is in logscale
    title : str
        title of the figure
    fig_name : str
        name of the saved figure
    folder_name : str
        name of the folder where to save the figure
    data_name : str
        name of the data, this is used to define the name of the figure,
        in the case ``fig_name`` is not defined. It is also combined
        with ``title`` to set the title of the axis.
    save_fig : bool
        specifies if the figure is saved
    ax : matplotlib.axes.Axes
        axis where to plot
    pos_text_LHC : dict, list or str
        passed to :py:func:`HEA.plot.tools.set_text_LHCb`
        as the ``pos`` argument.

    Returns
    -------
    The return value of ``end_plot_function``; presumably

    fig : matplotlib.figure.Figure
        Figure of the plot (only if ``ax`` is not specified)
    ax : matplotlib.figure.Axes
        Axis of the plot (only if ``ax`` is not specified)
    """
    # low, high and units into a list of size 2
    low = el_to_list(low, 2)
    high = el_to_list(high, 2)
    units = el_to_list(units, 2)

    # Let the plotting tools redefine the range of each branch
    # from the data (presumably when low or high is not given)
    for i in (0, 1):
        low[i], high[i] = pt._redefine_low_high(
            low[i], high[i], df[branches[i]])

    # Plotting
    fig, ax = get_fig_ax(ax)

    title = string.add_text(data_name, title, default=None)
    ax.set_title(title, fontsize=25)

    # The log scale only adds a normalisation to the histogram
    hist_options = {
        'range': [[low[0], high[0]], [low[1], high[1]]],
        'bins': n_bins,
    }
    if log_scale:
        hist_options['norm'] = LogNorm()

    _, _, _, image = ax.hist2d(df[branches[0]], df[branches[1]],
                               **hist_options)

    # Label, color bar
    pt.set_label_ticks(ax)
    pt.set_text_LHCb(ax, pos=pos_text_LHC)

    set_label_2Dhist(ax, latex_branches, units, fontsize=25)
    cbar = plt.colorbar(image)
    cbar.ax.tick_params(labelsize=20)

    # Name of the figure used when ``fig_name`` is not given
    default_fig_name = string.add_text(
        string.list_into_string(branches, '_vs_'),
        data_name,
        '_')

    return end_plot_function(fig, save_fig=save_fig,
                             fig_name=fig_name,
                             folder_name=folder_name,
                             default_fig_name=default_fig_name,
                             ax=ax)
def _plot_single_model(ax, x, model, plot_scaling,
                       model_type=None, model_name=None,
                       frac=1., color='b', linestyle='-',
                       line_width=2.5, alpha=1):
    """ Plot one single model with ``plot_fitted_curve``.

    If ``model_name`` is specified, the curve gets the label
    ``"{name of the PDF (e.g., Gaussian, ...)} - {type of the model,
    e.g., signal ...}"`` combined with ``model_name``
    (e.g., ``"B0->Dst Ds"``) through ``string.add_text``.
    Otherwise, the curve has no label.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        axis where to plot
    x : numpy.numpy(float)
        points of the x-axis where to evaluate the pdf
        of the model to plot
    model : zfit.pdf.BasePDF
        just one zfit model (a list or tuple is rejected)
    plot_scaling : float
        scaling to get the scale of the curve right
    model_type : str
        type of the model, used as a key of ``model_names_types``
        to write the label. According to the original documentation:

        * ``'m'`` : model (sum) ; should always be the FIRST ONE !!
        * ``'s'`` : signal
        * ``'b'`` : background

    model_name : str
        name of the model - used in the legend.
        If ``None``, no label is given to the curve.
    frac : float
        frac is multiplied to the PDF to get the correct scale
        due to composite PDFs
    color : str
        color of the curve
    linestyle : str
        line style of the curve
    line_width : float
        width of the plotted line
    alpha : float
        passed to ``plot_fitted_curve`` as ``alpha``
        (presumably the opacity of the curve)

    Raises
    ------
    AssertionError
        if ``model`` is a list or a tuple
    """
    assert not assertion.is_list_tuple(model)

    # Label: only built when a name of model is provided
    label_model = None
    if model_name is not None:
        pdf_and_type = (
            f'{get_model_name(model)} - {model_names_types[model_type]}')
        label_model = string.add_text(pdf_and_type, model_name)

    plot_fitted_curve(ax, model, plot_scaling,
                      frac=frac,
                      line_width=line_width,
                      color=color,
                      linestyle=linestyle,
                      label=label_model,
                      x=x,
                      alpha=alpha)
def correlations(data, fig_name=None, folder_name=None, title=None, **kwds):
    """ Calculate pairwise correlation between features of the dataframe
    ``data``, plot the correlation matrix and save the figure.

    The figure is saved with ``pt.save_fig``, with the name
    ``corr_matrix_{fig_name}`` and the folder name ``BDT/{folder_name}``.

    Parameters
    ----------
    data : pandas.Dataframe
        dataset
    fig_name : str
        name of the saved file. If ``None``, it is built from
        the names of the columns of the correlation matrix.
    folder_name : str
        name of the folder where to save the plot
    title : str
        text added to ``"Correlations"`` in the title of the plot
    **kwds : dict
        keyword arguments passed to ``pandas.DataFrame.corr()``

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure of the plot
    ax : matplotlib.figure.Axes
        Axis of the plot
    """
    # simply call df.corr() to get a table of
    # correlation values if you do not need
    # the fancy plotting
    corrmat = data.corr(**kwds)

    fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))

    opts = {
        'cmap': plt.get_cmap("RdBu"),  # red blue color mode
        'vmin': -1,  # correlation between -1 and 1
        'vmax': +1,
    }
    heatmap1 = ax1.pcolor(corrmat, **opts)  # pseudo color plot
    plt.colorbar(heatmap1, ax=ax1)

    title = string.add_text("Correlations", title, ' - ')
    ax1.set_title(title)

    # Keep the raw names of the columns: they are needed below
    # for the default name of the figure
    columns = list(corrmat.columns.values)

    # The ticks are labelled with the latex names of the branches
    labels = [
        RVariable.get_latex_branch_unit_from_branch(column)[0]
        for column in columns
    ]

    # shift location of ticks to center of the bins
    tick_positions = np.arange(len(labels)) + 0.5
    ax1.set_xticks(tick_positions, minor=False)
    ax1.set_yticks(tick_positions, minor=False)
    ax1.set_xticklabels(labels, minor=False, ha='right', rotation=70)
    ax1.set_yticklabels(labels, minor=False)

    plt.tight_layout()

    # Default name of the figure: built from the names of the columns
    # (the previous code used an undefined name here and raised a
    # NameError whenever ``fig_name`` was not given)
    if fig_name is None:
        fig_name = string.list_into_string(columns)

    pt.save_fig(fig, f"corr_matrix_{fig_name}",
                folder_name=f'BDT/{folder_name}')

    return fig, ax1