def test_bayesian_blocks(cmdopt, data_gen):
    """Regression test for ``bayesian_blocks`` edge determination.

    Runs three configurations (false-positive rate ``p0``, explicit
    ``gamma``, and weighted data) on the generated dataset.  With
    ``cmdopt == 'generate'`` the resulting edges are written to the answer
    file; with ``cmdopt == 'test'`` they are compared against the stored
    answers.
    """
    be1 = bayesian_blocks(data_gen[0], p0=0.05)
    be2 = bayesian_blocks(data_gen[0], gamma=0.1)
    be3 = bayesian_blocks(data_gen[0], weights=data_gen[2])
    if cmdopt == "generate":
        with open(answer_dir+'/answers_bayesian_blocks.npz', 'wb') as f:
            np.savez(f, be1=be1, be2=be2, be3=be3)
    elif cmdopt == "test":
        # np.load on an .npz returns an NpzFile that keeps the underlying
        # zip handle open for lazy array access; use a context manager so
        # the file is closed deterministically instead of leaking.
        with np.load(answer_dir+'/answers_bayesian_blocks.npz') as answers:
            assert np.all(be1 == answers['be1'])
            assert np.all(be2 == answers['be2'])
            assert np.all(be3 == answers['be3'])
def bb_optimizer(data, resample_list, roughs, elis):
    """Scan candidate ``p0`` values for Bayesian Blocks and return the one
    whose combined ranking is best.

    For each candidate, the data is re-binned with Bayesian Blocks and two
    metrics are computed: roughness (via ``rough``) and the mean
    linear-interpolation error over the bootstrap ``resample_list`` (via
    ``err_li``).  Each metric is ranked against the reference lists
    ``roughs`` / ``elis``; the candidate with the lowest summed rank wins,
    with ties broken in favor of the later (larger) candidate.
    """
    winning_p0 = 0
    winning_rank = np.inf
    for candidate in np.logspace(-4.5, 0, 50):
        edges = bayesian_blocks(data, p0=candidate)
        counts, bins = np.histogram(data, bins=edges)
        # Roughness is evaluated on the bin-width-normalized histogram.
        roughness = rough((counts / np.diff(bins), bins))
        mean_err = np.mean([err_li(sample, (counts, bins))
                            for sample in resample_list])
        # The candidate's own rank is the last entry of each ranking.
        combined = (rankdata(elis + [mean_err])[-1] +
                    rankdata(roughs + [roughness])[-1])
        if combined <= winning_rank:
            winning_p0 = candidate
            winning_rank = combined
    return winning_p0
def _df_binning_init(self, data, weights):
    '''Do an initial binning to get bin edges, total hist range, and break each
    set of data and weights into a dataframe (easier to handle errorbar
    calculation moving forward).

    Side effects: sets ``self.bin_edges``, ``self.bin_range``,
    ``self.widths``, ``self.bin_centers``, ``self.df_list``,
    ``self.bin_content``, ``self.bin_content_orig`` and, for non-marker
    histtypes, ``self.vis_object``.
    '''
    # If bin edges are already determined (self.bins holds a non-string
    # iterable of explicit edges), then skip initial histogramming.
    self.bin_edges = None
    if isinstance(self.bins, Iterable) and not isinstance(self.bins, str):
        self.bin_edges = self.bins
        if self.bin_range is None:
            self.bin_range = (self.bin_edges[0], self.bin_edges[-1])
    # If bin edges need to be determined, there's a few different cases to consider
    else:
        if self.stacked:
            # Stacked plots are binned as a single combined sample.
            _n_data_sets = 1
            b_data = [np.concatenate(data)]
            if self.has_weights:
                b_weights = [np.concatenate(weights)]
            else:
                b_weights = None
        else:
            _n_data_sets = self.n_data_sets
            b_data = data
            b_weights = weights

        if self.bin_range is None:
            # Derive a common range from the min/max over all non-empty sets.
            xmin = np.inf
            xmax = -np.inf
            for i in range(_n_data_sets):
                if len(data[i]) > 0:
                    xmin = min(xmin, min(b_data[i]))
                    xmax = max(xmax, max(b_data[i]))
            self.bin_range = (xmin, xmax)

        # Special case for Bayesian Blocks
        if self.bins in ['block', 'blocks']:
            # Single data-set or stacked
            if _n_data_sets == 1:
                if self.has_weights:
                    b_weights = b_weights[0]
                else:
                    b_weights = None
                self.bin_edges = bayesian_blocks(data=b_data[0], weights=b_weights,
                                                 **self.bin_dict)
            else:
                raise ValueError(
                    'Cannot use Bayesian Blocks with multiple, unstacked datasets')
        else:
            # NOTE(review): b_data can be a list of arrays here; np.histogram
            # will flatten it, which presumes equal-length data sets — confirm
            # against callers with ragged inputs.
            _, self.bin_edges = np.histogram(b_data, bins=self.bins,
                                             weights=b_weights,
                                             range=self.bin_range)

    self.widths = np.diff(self.bin_edges)
    self.bin_centers = self.bin_edges[:-1] + self.widths * 0.5

    # Now put the data into dataframes with the weights and bins
    self.df_list = []
    for i in range(self.n_data_sets):
        if weights is None:
            df = pd.DataFrame({'data': data[i]})
        else:
            df = pd.DataFrame({'data': data[i], 'weights': weights[i]})
        df_bins = pd.cut(df.data, self.bin_edges, include_lowest=True)
        df['bins'] = df_bins
        self.df_list.append(df)

    # Make the initial histograms
    if self.histtype == 'marker':
        self.bin_content, _ = np.histogram(data, self.bin_edges, weights=weights,
                                           range=self.bin_range)
    else:
        self.bin_content, _, self.vis_object = self.ax.hist(
            data, self.bin_edges, weights=weights, range=self.bin_range,
            stacked=self.stacked, **self.hist_dict)
        # if self.stacked and self.errorbars and self.histtype == 'stepfilled':
        #     plt.setp(self.vis_object[-1][0], edgecolor='k')
        #     plt.setp(self.vis_object[-1][0], linewidth=2)

    # Keep a shallow copy of the raw contents before any later scaling
    # or normalization mutates self.bin_content.
    self.bin_content_orig = self.bin_content[:]

    if self.errorbars == 'calc' and not (self.normed or self.scale):
        self.calc_bin_error(hist_mod='default')
def comp_study(input_data, n_events, xlims=None, resamples=100, dist_name='2Gauss'):
    '''Compare binning methods on one dataset and save summary figures.

    Builds a 3x3 grid of histograms (Sturges, Doane, Scott, Freedman-Diaconis,
    Knuth, Rice, Sqrt(N), Equal Population, Bayesian Blocks), computes a
    roughness metric (``rough``) and a bootstrap average error (``err_li``)
    for each method, then draws a scatter plot of the two metrics and a
    stacked bar chart of the combined rankings.  Figures are saved as PDFs
    under ``bb_dir``.

    Parameters
    ----------
    input_data : array-like sampled from when ``dist_name`` is not synthetic
    n_events : number of events to draw
    xlims : optional (lo, hi) pair used to clamp the outermost Bayesian Block
        edges
    resamples : number of bootstrap resamples for the average-error metric
    dist_name : selects the distribution and RNG seed configuration
    '''
    # Hard-coded output directory for the saved figures.
    bb_dir = os.path.join('/Users/brianpollack/Coding/BayesianBlocks')
    do_log = True

    # data_nom = input_data[:n_events]
    # Each branch fixes its own RNG seed so the figures are reproducible.
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        do_log = False
    elif dist_name == '2LP':
        # Mixture: two Laplace peaks over a uniform background.
        np.random.seed(33)
        data_nom = np.concatenate(
            (np.random.laplace(loc=90, scale=5, size=int(n_events * 0.65)),
             np.random.laplace(loc=110, scale=1.5, size=int(n_events * 0.25)),
             np.random.uniform(low=80, high=120, size=int(n_events * 0.10))))
        resample_list = np.concatenate(
            (np.random.laplace(
                loc=90, scale=5, size=(resamples, int(n_events * 0.65))),
             np.random.laplace(
                 loc=110, scale=1.5, size=(resamples, int(n_events * 0.25))),
             np.random.uniform(
                 low=80, high=120, size=(resamples, int(n_events * 0.10)))),
            axis=1)
        do_log = False
    elif dist_name == 'jPT':
        np.random.seed(11)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events),
                                         replace=True)
    elif dist_name == 'DY':
        np.random.seed(200)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events),
                                         replace=True)
    else:
        np.random.seed(1)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data, size=(resamples, n_events),
                                         replace=True)

    fig_hist, axes_hist = plt.subplots(3, 3, sharex=True, sharey=False,
                                       constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    # fig_hist.text(-0.03, 0.5, 'Entries/Bin Width', va='center', rotation='vertical', fontsize=20)

    # axes_hist[2][0].get_xaxis().set_ticks([])
    # axes_hist[2][1].get_xaxis().set_ticks([])
    # axes_hist[2][2].get_xaxis().set_ticks([])

    # One panel per classic binning rule.  skh_plt.hist returns the
    # bin-width-scaled histogram later fed to rough().
    axes_hist[0][0].set_title('Sturges')
    hist_sturges_bw = skh_plt.hist(x=data_nom, histtype='stepfilled',
                                   bins='sturges', errorbars=False, alpha=0.5,
                                   log=do_log, scale='binwidth',
                                   err_type='gaussian', ax=axes_hist[0][0])

    axes_hist[0][1].set_title('Doane')
    hist_doane_bw = skh_plt.hist(x=data_nom, histtype='stepfilled',
                                 bins='doane', errorbars=False, alpha=0.5,
                                 log=do_log, scale='binwidth',
                                 err_type='gaussian', ax=axes_hist[0][1])

    axes_hist[0][2].set_title('Scott')
    hist_scott_bw = skh_plt.hist(x=data_nom, histtype='stepfilled',
                                 bins='scott', errorbars=False, alpha=0.5,
                                 log=do_log, scale='binwidth',
                                 err_type='gaussian', ax=axes_hist[0][2])

    axes_hist[1][0].set_title('Freedman Diaconis')
    axes_hist[1][0].set_ylabel('Entries/Bin Width', fontsize=20)
    hist_fd_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='fd',
                              errorbars=False, alpha=0.5, log=do_log,
                              scale='binwidth', err_type='gaussian',
                              ax=axes_hist[1][0])

    axes_hist[1][1].set_title('Knuth')
    # Knuth's rule is not a numpy built-in; compute its edges explicitly.
    _, bk = knuth_bin_width(data_nom, return_bins=True)
    hist_knuth_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bk,
                                 errorbars=False, alpha=0.5, log=do_log,
                                 scale='binwidth', err_type='gaussian',
                                 ax=axes_hist[1][1])

    axes_hist[1][2].set_title('Rice')
    hist_rice_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='rice',
                                errorbars=False, alpha=0.5, log=do_log,
                                scale='binwidth', err_type='gaussian',
                                ax=axes_hist[1][2])

    axes_hist[2][0].set_title('Sqrt(N)')
    hist_sqrt_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins='sqrt',
                                errorbars=False, alpha=0.5, log=do_log,
                                scale='binwidth', err_type='gaussian',
                                ax=axes_hist[2][0])
    # bep = bep_optimizer(data_nom)
    # _, bep = pd.qcut(data_nom, nep, retbins=True)

    # Unscaled (raw count) histograms used by the err_li error metric.
    hist_sturges = np.histogram(data_nom, bins='sturges')
    hist_doane = np.histogram(data_nom, bins='doane')
    hist_scott = np.histogram(data_nom, bins='scott')
    hist_fd = np.histogram(data_nom, bins='fd')
    hist_knuth = np.histogram(data_nom, bins=bk)
    hist_rice = np.histogram(data_nom, bins='rice')
    hist_sqrt = np.histogram(data_nom, bins='sqrt')

    # Roughness ("wiggliness") metric per method, on bin-width-scaled hists.
    r_sturges = rough(hist_sturges_bw, plot=False)
    r_doane = rough(hist_doane_bw)
    r_scott = rough(hist_scott_bw)
    r_fd = rough(hist_fd_bw)
    r_knuth = rough(hist_knuth_bw, plot=False)
    r_rice = rough(hist_rice_bw)
    r_sqrt = rough(hist_sqrt_bw, plot=False)

    # Error on the nominal data itself, per method.
    eli_sturges = err_li(data_nom, hist_sturges)
    eli_doane = err_li(data_nom, hist_doane)
    eli_scott = err_li(data_nom, hist_scott)
    eli_fd = err_li(data_nom, hist_fd)
    eli_knuth = err_li(data_nom, hist_knuth)
    eli_rice = err_li(data_nom, hist_rice)
    eli_sqrt = err_li(data_nom, hist_sqrt)

    # Bootstrap: average the error metric over all resampled datasets.
    avg_eli_sturges = []
    avg_eli_doane = []
    avg_eli_scott = []
    avg_eli_fd = []
    avg_eli_knuth = []
    avg_eli_rice = []
    avg_eli_sqrt = []
    for i in resample_list:
        avg_eli_sturges.append(err_li(i, hist_sturges))
        avg_eli_doane.append(err_li(i, hist_doane))
        avg_eli_scott.append(err_li(i, hist_scott))
        avg_eli_fd.append(err_li(i, hist_fd))
        avg_eli_knuth.append(err_li(i, hist_knuth))
        avg_eli_rice.append(err_li(i, hist_rice))
        avg_eli_sqrt.append(err_li(i, hist_sqrt))
    avg_eli_sturges = np.mean(avg_eli_sturges)
    avg_eli_doane = np.mean(avg_eli_doane)
    avg_eli_scott = np.mean(avg_eli_scott)
    avg_eli_fd = np.mean(avg_eli_fd)
    avg_eli_knuth = np.mean(avg_eli_knuth)
    avg_eli_rice = np.mean(avg_eli_rice)
    avg_eli_sqrt = np.mean(avg_eli_sqrt)

    # Summary lists in fixed method order (matches `xs` labels below).
    avg_eli_list = [
        avg_eli_sturges, avg_eli_doane, avg_eli_scott, avg_eli_fd,
        avg_eli_knuth, avg_eli_rice, avg_eli_sqrt
    ]
    r_list = [r_sturges, r_doane, r_scott, r_fd, r_knuth, r_rice, r_sqrt]
    elis_list = [
        eli_sturges, eli_doane, eli_scott, eli_fd, eli_knuth, eli_rice,
        eli_sqrt
    ]

    # Equal Population binning, with its bin count tuned via the same
    # rank-based criterion used for Bayesian Blocks.
    axes_hist[2][1].set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bep,
                              errorbars=False, alpha=0.5, log=do_log,
                              scale='binwidth', err_type='gaussian',
                              ax=axes_hist[2][1])
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = []
    for i in resample_list:
        avg_eli_ep.append(err_li(i, hist_ep))
    avg_eli_ep = np.mean(avg_eli_ep)

    # Bayesian Blocks with rank-optimized p0.
    axes_hist[2][2].set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        # Clamp the outermost edges to the requested plotting range.
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bb,
                              errorbars=False, alpha=1, log=do_log,
                              scale='binwidth', err_type='gaussian',
                              ax=axes_hist[2][2])
    # if n_events == 1000 and dist_name == '2LP':
    #     axes_hist[2][2].set_ylim((0, 100))
    hist_bb = np.histogram(data_nom, bins=bb)

    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = []
    for i in resample_list:
        avg_eli_bb.append(err_li(i, hist_bb))
    avg_eli_bb = np.mean(avg_eli_bb)

    # Append EP and BB results so all nine methods are ranked together.
    r_list.append(r_ep)
    r_list.append(r_bb)
    avg_eli_list.append(avg_eli_ep)
    avg_eli_list.append(avg_eli_bb)
    elis_list.append(eli_ep)
    elis_list.append(eli_bb)

    plt.savefig(bb_dir + f'/plots/bin_comp/hists_{dist_name}_{n_events}.pdf')

    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]
    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    # NOTE(review): this titles fig_hist (already titled above), not the new
    # fig_metric — looks like it should be fig_metric.suptitle; confirm.
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}')
    # Scatter of average error vs roughness; BB is highlighted with a star.
    for i in range(len(elis_list)):
        if xs[i] == 'BB':
            axes_metric[0].scatter(avg_eli_list[i], r_list[i], label=xs[i],
                                   s=400, marker='*', c='k')
        else:
            axes_metric[0].scatter(avg_eli_list[i], r_list[i], label=xs[i],
                                   s=200)
    axes_metric[0].set_ylabel(r'$W_n$ (Wiggles)')
    axes_metric[0].set_xlabel(r'$\hat{E}$ (Average Error)')
    # ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    # ax.relim()
    # ax.autoscale_view()
    axes_metric[0].grid()
    axes_metric[0].legend(ncol=1, bbox_to_anchor=(1.05, 1.15), loc='upper left')
    axes_metric[0].set_title(f'{dist_name} Distribution, N={n_events}',
                             fontsize=22)
    # plt.savefig(bb_dir+f'/plots/bin_comp/scat_{dist_name}_{n_events}.pdf')

    # plt.figure()
    # Stacked bars of the two rankings; the last bar (BB) is drawn opaque.
    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')
    cont = axes_metric[1].bar(xs, rank_rough, 0.35, label=r'$W_n$ Ranking',
                              alpha=0.5)
    cont[-1].set_alpha(1)
    cont = axes_metric[1].bar(xs, rank_avg_eli, 0.35, bottom=rank_rough,
                              label=r'$\hat{E}$ Ranking', alpha=0.5)
    cont[-1].set_alpha(1)
    axes_metric[1].legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    # axes_metric[1].set_title(f'Combined Ranking, {dist_name} Distribution, N={n_events}')
    axes_metric[1].set_xlabel('Binning Method')
    axes_metric[1].set_ylabel('Rank')
    plt.savefig(bb_dir + f'/plots/bin_comp/metric_{dist_name}_{n_events}.pdf')