def plot_comparison():
    extra_vars = [gcm().ltime_var]

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.mass_signal_region()

    gen = get_model()

    outfile = gcm().get_output_path('effs') + 'Gen_DATA_Comp.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            filled = gen[pc.var]
            errorbars = data[pc.var][df_sel]
            if pc.convert is not None:
                filled = pc.convert(filled)
                errorbars = pc.convert(errorbars)
            ax = comparison.plot_comparison(
                pc, filled, errorbars, 'Model', 'Data')
            ax.set_xlabel(pc.xlabel)
            ax.yaxis.set_visible(False)
            ax.legend()
            pdf.savefig(plt.gcf())

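# A rough sketch of what comparison.plot_comparison presumably produces (the
# repo's helper will differ in detail): the 'filled' sample drawn as a
# filled, unit-normalised histogram and the 'errorbars' sample as points
# with Poisson errors, so the two shapes can be compared directly.
# Everything here beyond the (nbins, lo, hi) PlotConfig binning convention
# is an assumption.
def _plot_comparison_sketch(pc, filled, errorbars, filled_label, data_label):
    nbins, lo, hi = pc.binning
    fig, ax = plt.subplots(figsize=(10, 10))
    # Model: filled histogram normalised to unit area
    ax.hist(filled, bins=int(nbins), range=(lo, hi), density=True,
            histtype='stepfilled', alpha=0.5, label=filled_label)
    # Data: error bars, normalised with the same convention
    values, edges = np.histogram(errorbars, bins=int(nbins), range=(lo, hi))
    norm = np.sum(values) * np.diff(edges)
    centres = 0.5 * (edges[1:] + edges[:-1])
    ax.errorbar(centres, values / norm, np.sqrt(values) / norm,
                fmt='o', markersize=5, label=data_label)
    return ax
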
def train_reweighter():
    extra_vars = [
        gcm().ltime_var
    ]
    all_vars = gcm().phsp_vars + extra_vars
    columns = [v.var for v in all_vars if 'phi' not in v.var]
    columns += ['cosphi', 'sinphi']

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    data['cosphi'] = np.cos(data.phi1)
    data['sinphi'] = np.sin(data.phi1)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()
    gen['cosphi'] = np.cos(gen.phi1)
    gen['sinphi'] = np.sin(gen.phi1)

    limits = {v.var: v.binning[1:] for v in all_vars}
    limits['cosphi'] = (-1., 1)
    limits['sinphi'] = (-1., 1)

    for c in columns:
        mi, ma = limits[c]
        data[c] = (data[c] - mi) / (ma - mi) + 2.
        gen[c] = (gen[c] - mi) / (ma - mi) + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(columns)))
    reweighter = GBReweighter(n_estimators=300, max_depth=5,
                              learning_rate=0.2)
    reweighter.fit(original=gen[columns].sample(n=250000),
                   target=data[columns][df_sel].sample(n=250000))
    bdt_utils.dump_reweighter(reweighter)

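# A minimal sketch (not the repo's compute_efficiency) of how the dumped
# reweighter is then applied: hep_ml's GBReweighter.predict_weights returns
# per-event weights that morph the 'original' sample (the model) into the
# 'target' (the data), i.e. weights proportional to the efficiency. The
# loader name 'bdt_utils.load_reweighter' is assumed here as the counterpart
# of dump_reweighter above, and the same column normalisation as in
# train_reweighter must be applied to df[columns] first.
def _apply_reweighter_sketch(df, columns):
    reweighter = bdt_utils.load_reweighter()  # hypothetical loader
    weights = reweighter.predict_weights(df[columns])
    return weights / np.mean(weights)  # normalise to unit mean
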
def lifetime_study(correct_efficiencies=False):
    # Current mode stuff
    data = gcm().get_data([gcm().ltime_var.var])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()
    data['weight'] = 1.

    if correct_efficiencies:
        outfile = gcm().get_output_path('effs') + 'DATA_ltime_dep_effs.pdf'
    else:
        outfile = gcm().get_output_path('effs') + 'DATA_ltime_dep.pdf'

    percentiles = np.arange(0, 1.1, 0.2)
    boundaries = helpers.weighted_quantile(
        data[gcm().ltime_var.var][df_sel], percentiles)
    if correct_efficiencies:
        data['weight'] = 1./get_efficiency()
        boundaries = boundaries[1:]

    with PdfPages(outfile) as pdf:
        for var in gcm().phsp_vars:
            fig, ax = plt.subplots(figsize=(10, 10))
            for low, high in zip(boundaries[:-1], boundaries[1:]):
                sel = (data[gcm().ltime_var.var] > low) & (data[gcm().ltime_var.var] < high)  # NOQA
                df = data[var.var][df_sel & sel]
                weight = data['weight'][df_sel & sel]
                rlow, prec = helpers.rounder(low*1000, [low*1000, high*1000])
                rhigh, _ = helpers.rounder(high*1000, [low*1000, high*1000])
                spec = '{{:.{}f}}'.format(prec)
                label = r'${} < \tau \mathrm{{ [ps]}} < {}$'.format(
                    spec.format(rlow), spec.format(rhigh))
                values, edges = np.histogram(
                    df, bins=int(var.binning[0]/5.),
                    range=var.binning[1:], weights=weight)
                err, edges = np.histogram(
                    df, bins=int(var.binning[0]/5.),
                    range=var.binning[1:], weights=weight**2)
                norm = np.sum(values)
                values = values/norm
                err = np.sqrt(err)/norm
                x_ctr = (edges[1:] + edges[:-1])/2.
                width = (edges[1:] - edges[:-1])
                x_err = width/2.
                options = dict(
                    fmt='o', markersize=5, capthick=1, capsize=0,
                    elinewidth=2, alpha=1)
                ax.errorbar(x_ctr, values, err, x_err, label=label, **options)
            ax.set_xlabel(var.xlabel)
            ax.yaxis.set_visible(False)
            ax.legend()
            pdf.savefig(plt.gcf())
            plt.close()

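# For reference, a minimal weighted quantile in the spirit of
# helpers.weighted_quantile (the actual helper may differ; this sketch
# defaults to uniform weights, which matches how it is called above).
def _weighted_quantile_sketch(values, quantiles, weights=None):
    values = np.asarray(values, dtype=float)
    weights = np.ones_like(values) if weights is None else np.asarray(weights)
    sorter = np.argsort(values)
    values, weights = values[sorter], weights[sorter]
    # Cumulative weight fraction at each sorted value
    cum = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights)
    return np.interp(quantiles, cum, values)
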
def get_efficiency():
    """Returns the BDT efficiency, training the reweighter first if
    necessary."""
    extra_vars = [
        gcm().ltime_var
    ]
    all_vars = gcm().phsp_vars + extra_vars
    columns = [v.var for v in all_vars if 'phi' not in v.var]
    columns += ['cosphi', 'sinphi']
    log.info('Getting efficiencies for {}'.format(', '.join(columns)))

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    data['cosphi'] = np.cos(data.phi1)
    data['sinphi'] = np.sin(data.phi1)

    return compute_efficiency(data)

def simple_phsp_efficiencies():
    extra_vars = [
        gcm().ltime_var
    ]

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()

    outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            denominator = gen[pc.var]
            numerator = data[pc.var][df_sel]
            weight_d = np.ones(denominator.index.size)*1./denominator.index.size  # NOQA
            weight_n = np.ones(numerator.index.size)*1./numerator.index.size
            fig, ax = plt.subplots(figsize=(10, 10))
            if pc.convert is not None:
                numerator = pc.convert(numerator)
                denominator = pc.convert(denominator)
            x, y, x_err, y_err = helpers.make_efficiency(
                numerator, denominator, 100, weight_n, weight_d,
                independent=True)
            options = dict(
                fmt='o', markersize=5, capthick=1, capsize=0, elinewidth=2,
                alpha=1)
            ax.errorbar(x, y, y_err, x_err, **options)
            ax.set_xlabel(pc.xlabel)
            ax.set_ylabel('Relative efficiency')
            pdf.savefig(plt.gcf())
            plt.close()

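# A minimal sketch of a ratio-of-histograms efficiency in the spirit of
# helpers.make_efficiency (binning and error treatment in the real helper
# may differ). For independent samples, as assumed here, the relative
# uncertainties of numerator and denominator add in quadrature.
def _make_efficiency_sketch(numerator, denominator, nbins, weight_n, weight_d):
    lo = min(np.min(numerator), np.min(denominator))
    hi = max(np.max(numerator), np.max(denominator))
    num, edges = np.histogram(numerator, nbins, (lo, hi), weights=weight_n)
    num_w2, _ = np.histogram(numerator, nbins, (lo, hi), weights=weight_n**2)
    den, _ = np.histogram(denominator, nbins, (lo, hi), weights=weight_d)
    den_w2, _ = np.histogram(denominator, nbins, (lo, hi), weights=weight_d**2)
    y = num / den
    y_err = y * np.sqrt(num_w2 / num**2 + den_w2 / den**2)
    x = 0.5 * (edges[1:] + edges[:-1])
    x_err = 0.5 * np.diff(edges)
    return x, y, x_err, y_err
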
def get_bdt_data(sw=False, sklearn=True, same_weight=False, comb_data=False,
                 plot=False):
    """Returns the data for the BDT training, containing all the necessary
    variables and weights.

    :sw: Use sweights instead of sidebands and signal region
    :sklearn: return sklearn compatible dataframe
    :same_weight: Changes the weights to have identical normalisation
    :comb_data: Return BDT data for the combinatorial background,
        random slow pion otherwise
    :returns: if sklearn is True: DataFrame
              if sklearn is False: sig_df, bkg_df, sig_wgt, bkg_wgt
    """
    if comb_data:
        log.info('Returning data for combinatorial background')
        bdt_vars = gcm().comb_bkg_bdt_vars[:]
        bkg_sel = selection.comb_bkg_sideband_region()
    else:
        log.info('Returning data for random slow pion background')
        bdt_vars = gcm().rand_spi_bdt_vars[:]
        bkg_sel = selection.rand_spi_sideband_region()
    bdt_vars += gcm().spectator_vars
    # Only add the variables for plotting if needed
    if plot:
        bdt_vars += gcm().just_plot

    df = gcm().get_data(
        [v.var for v in bdt_vars if v.functor.additional is False])
    add_variables.append_angle(df)
    sel = selection.full_selection()
    add_variables.append_phsp(df)
    add_variables.append_dtf_ip_diff(df)

    for f in bdt_vars:
        if f.convert is not None:
            df[f.var] = f.convert(df[f.var])

    if sw:
        from analysis.mass_fitting import get_sweights
        sweights = get_sweights(gcm())
        np.random.seed(42)
        df['labels'] = (np.random.rand(df.index.size) < 0.5).astype(int)
        sweights['bkg'] = sweights.rnd + sweights.comb
        df['weights'] = (df['labels'] == 1) * sweights.sig
        df['weights'] += (df['labels'] == 0) * (sweights['rnd'] + sweights['comb'])  # NOQA
        # Weights are only present for those rows that are selected,
        # so we select
        df = df.loc[~np.isnan(df['weights'])]
    else:
        df['weights'] = np.ones(df.index.size)
        df.loc[selection.mass_signal_region() & sel, 'labels'] = 1
        df.loc[bkg_sel & sel, 'labels'] = 0
        if config.add_wrongsign and gcm().mode not in config.wrong_sign_modes:
            with opposite_mode():
                df_op = get_bdt_data(sw, sklearn=True)
            df = df.append(df_op.query('labels == 0'), ignore_index=True)

        tot0 = np.sum(df.query('labels == 0').weights)
        # Reduce the statistics of the signal to no more than 5 times the
        # background
        tot1 = np.sum(df.query('labels == 1').weights)
        if 5. * tot0 < tot1:
            sig_sel = df['labels'] == 1
            bkg_sel = df['labels'] == 0
            tot1_max = 5 * tot0
            log.info('Changing signal events {} ---> {}'.format(
                tot1, tot1_max))
            sel = df.index.isin(
                df.query('labels==1').sample(
                    int(tot1_max), random_state=45).index)
            df.loc[sig_sel & sel, 'keep'] = 1
            df.loc[bkg_sel, 'keep'] = 1
            log.info('DataFrame content before: {}'.format(df.index.size))
            if same_weight:
                # Weight the label 1 sample to have the same total as label 0
                df.loc[df['labels'] == 1, 'weights'] = float(tot0) / tot1_max
            df = df.loc[~np.isnan(df['labels']) & ~np.isnan(df['keep'])]
            log.info('DataFrame content after: {}'.format(df.index.size))
        else:
            tot1_max = tot1
            if same_weight:
                # Weight the label 1 sample to have the same total as label 0
                df.loc[df['labels'] == 1, 'weights'] = float(tot0) / tot1_max
            df = df.loc[~np.isnan(df['labels'])]

    if sklearn:
        return df
    return (df.query('labels == 1'), df.query('labels == 0'),
            df.query('labels == 1').weights, df.query('labels == 0').weights)

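# Example use of the returned frame (a sketch; the repo's actual training
# code lives elsewhere): fit any sklearn-style classifier on the labelled,
# weighted events. GradientBoostingClassifier is a stand-in for whatever
# classifier is really trained, and the feature list deliberately excludes
# the spectator variables that get_bdt_data also fetches.
def _train_classifier_sketch(comb_data=False):
    from sklearn.ensemble import GradientBoostingClassifier
    df = get_bdt_data(sklearn=True, comb_data=comb_data)
    if comb_data:
        features = [v.var for v in gcm().comb_bkg_bdt_vars]
    else:
        features = [v.var for v in gcm().rand_spi_bdt_vars]
    clf = GradientBoostingClassifier(n_estimators=200, max_depth=3)
    clf.fit(df[features], df['labels'], sample_weight=df['weights'])
    return clf
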
def phsp_comparison_plots():
    """Plots the mode sidebands and the opposite mode signal region phsp
    distributions. Only really meaningful if executed for the WS events.

    The opposite mode is plotted as a solid histogram, with the uncertainty
    propagated to the current mode's error plot.
    """
    # Besides phase space, also plot D0 momentum and flight distance
    extra_vars = [
        gcm().ltime_var,
        PlotConfig(vars.pt, gcm().D0, (100, 0, 15000)),
        PlotConfig(vars.vdchi2, gcm().D0, (100, 0, 10), np.log,
                   r'$\ln(\text{{{}}})$'),  # NOQA
    ]
    # Opposite mode
    with opposite_mode():
        OS = gcm().get_data([f.var for f in extra_vars])
        add_variables.append_phsp(OS)
        os_sel = final_selection.get_final_selection()
        os_sel &= selection.delta_mass_signal_region()
        OS_weight = erf(OS[gcm().ltime_var.var]*1600)/24. + 0.038 + OS[gcm().ltime_var.var]*4  # NOQA

    # Current mode stuff
    DF = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(DF)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.mass_sideband_region()

    outfile = gcm().get_output_path('selection') + 'phsp_comp.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            filled = OS[pc.var][os_sel]
            filled_weights = OS_weight[os_sel]
            errorbars = DF[pc.var][df_sel]
            if pc.convert is not None:
                filled = pc.convert(filled)
                errorbars = pc.convert(errorbars)
            ax = comparison.plot_comparison(
                pc, filled, errorbars, 'RS signal', 'WS background',
                normed_max=True)
            ax.set_xlabel(pc.xlabel)
            plot_utils.y_margin_scaler(ax, lf=0, la=True)
            ax.set_ylabel('Arbitrary units')
            ax.legend()
            pdf.savefig(plt.gcf())
            plt.clf()
            ax = comparison.plot_comparison(
                pc, filled, errorbars, 'RS signal', 'WS background',
                filled_weight=filled_weights, normed_max=True)
            ax.set_xlabel(pc.xlabel)
            plot_utils.y_margin_scaler(ax, lf=0, la=True)
            ax.set_ylabel('Arbitrary units')
            ax.legend()
            pdf.savefig(plt.gcf())

def dependence_study(use_efficiencies=False):
    extra_vars = [
        gcm().ltime_var
    ]
    all_vars = gcm().phsp_vars + extra_vars

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()
    if use_efficiencies:
        outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff_dep_eff.pdf'
        gen['weight'] = get_efficiency_gen()
    else:
        outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff_dep.pdf'
        gen['weight'] = 1.
    lim_file = gcm().get_output_path('effs') + 'limits_for_eff.p'

    with PdfPages(outfile) as pdf:
        for selected, plotted in permutations(all_vars, 2):
            log.info('Plotting {} in intervals of {}'.format(
                plotted.var, selected.var))
            percentiles = np.arange(0, 1.1, 0.2)
            boundaries = helpers.weighted_quantile(
                data[selected.var][df_sel], percentiles)
            fig, ax = plt.subplots(figsize=(10, 10))
            for low, high in zip(boundaries[:-1], boundaries[1:]):
                num_sel = (data[selected.var] > low) & (data[selected.var] < high)  # NOQA
                den_sel = (gen[selected.var] > low) & (gen[selected.var] < high)
                denominator = gen[plotted.var][den_sel]
                numerator = data[plotted.var][df_sel & num_sel]
                weight_d = gen['weight'][den_sel]
                weight_d /= np.sum(weight_d)
                weight_n = np.ones(numerator.index.size)*1./numerator.index.size  # NOQA
                x, y, x_err, y_err = helpers.make_efficiency(
                    numerator, denominator, 50, weight_n, weight_d,
                    independent=True)
                options = dict(
                    fmt='o', markersize=5, capthick=1, capsize=0,
                    elinewidth=2, alpha=1)
                rlow, prec = helpers.rounder(low, boundaries)
                rhigh, _ = helpers.rounder(high, boundaries)
                spec = '{{:.{}f}}'.format(prec)
                label = r'${} <$ {} $ < {}$'.format(
                    spec.format(rlow), selected.xlabel, spec.format(rhigh))
                ax.errorbar(x, y, y_err, x_err, label=label, **options)
            ax.set_xlabel(plotted.xlabel)
            ax.set_ylabel('Relative efficiency')
            try:
                limits = load(lim_file)
            except Exception:
                log.info('Creating new limits file')
                limits = {}
            if limits is None:
                log.info('Creating new limits file')
                limits = {}
            if (plotted.var, selected.var) not in limits or use_efficiencies is False:  # NOQA
                plot_utils.y_margin_scaler(ax, hf=0.4)
                limits[(plotted.var, selected.var)] = ax.get_ylim()
            else:
                log.info('Applying limits')
                lim = limits[(plotted.var, selected.var)]
                ax.set_ylim(lim)
            dump(limits, lim_file)
            ax.legend()
            pdf.savefig(plt.gcf())
            plt.close()

def correlations(comb_bkg=False):
    sns.set(style="white")
    if comb_bkg:
        features_config = gcm().comb_bkg_bdt_vars
        bdt_folder = 'bdt_comb_bkg'
        bkg_sel = selection.comb_bkg_sideband_region()
    else:
        features_config = gcm().rand_spi_bdt_vars
        bdt_folder = 'bdt_rand_spi'
        bkg_sel = selection.rand_spi_sideband_region()

    functors = set()
    for pc in features_config:
        functors.add((pc.functor, pc.particle))
    functors.add((m, gcm().D0))
    functors.add((dtf_dm, None))
    varlist = [f(p) for f, p in functors]
    nlist = [f.latex(p) for f, p in functors]
    df = gcm().get_data([i for i in varlist if 'angle' not in i])
    for pc in gcm().phsp_vars:
        functors.add((pc.functor, pc.particle))
    varlist = [f(p) for f, p in functors]
    nlist = [f.latex(p) for f, p in functors]
    sel = selection.full_selection()
    add_variables.append_angle(df)
    add_variables.append_phsp(df)
    df = df[sel]
    signal_sel = selection.mass_signal_region()

    suffix = ['sig', 'bkg']
    for s, n in zip([signal_sel, bkg_sel], suffix):
        # Correlations for the selected sample only
        correlations = df[s].corr()
        correlations_array = np.asarray(correlations)

        # Order the variables by hierarchical clustering of the correlations
        row_linkage = hierarchy.linkage(correlations_array, method='average')
        from scipy.cluster.hierarchy import fcluster
        clusters = fcluster(row_linkage, 10, criterion='maxclust')
        clustered = list(
            next(zip(*sorted(zip(varlist, clusters), key=lambda x: x[1]))))
        clustered_names = list(
            next(zip(*sorted(zip(nlist, clusters), key=lambda x: x[1]))))

        correlations = correlations[clustered].loc[clustered] * 100

        f, ax = plt.subplots(figsize=(15, 15))
        # Mask the redundant upper triangle
        mask = np.zeros_like(correlations, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

        bla = sns.heatmap(
            correlations, mask=mask, annot=True, ax=ax, vmin=-100, vmax=100,
            square=True, fmt="+2.0f", linewidths=.8,
            yticklabels=clustered_names[1:],
            xticklabels=clustered_names[:-1], cbar=False)
        bla.set_xticklabels(bla.get_xticklabels(), rotation=90)
        bla.set_yticklabels(bla.get_yticklabels(), rotation=0)

        fn = 'correlations_{}.pdf'.format(n)
        outfile = gcm().get_output_path(bdt_folder) + fn
        bla.get_figure().savefig(outfile)