Code example #1
0
def test_bayesian_blocks(cmdopt, data_gen):
    """Regression-test `bayesian_blocks` against stored answers.

    Parameters
    ----------
    cmdopt : str
        'generate' to (re)write the answer file, 'test' to compare
        fresh results against it.
    data_gen : sequence
        Fixture; data_gen[0] is the data sample, data_gen[2] its weights.
    """
    be1 = bayesian_blocks(data_gen[0], p0=0.05)
    be2 = bayesian_blocks(data_gen[0], gamma=0.1)
    be3 = bayesian_blocks(data_gen[0], weights=data_gen[2])

    if cmdopt == "generate":
        with open(answer_dir+'/answers_bayesian_blocks.npz', 'wb') as f:
            np.savez(f, be1=be1, be2=be2, be3=be3)
    elif cmdopt == "test":
        # np.load on an .npz keeps the underlying zip open; use a context
        # manager so the file handle is released deterministically.
        with np.load(answer_dir+'/answers_bayesian_blocks.npz') as answers:
            assert np.all(be1 == answers['be1'])
            assert np.all(be2 == answers['be2'])
            assert np.all(be3 == answers['be3'])
Code example #2
0
def bb_optimizer(data, resample_list, roughs, elis):
    """Scan `p0` values for `bayesian_blocks` and return the best one.

    For each candidate p0, the resulting histogram is scored on
    roughness (`rough`) and on the mean linear-interpolation error
    (`err_li`) over the resampled datasets.  Each score is ranked
    against the reference values from the other binning methods
    (`roughs` / `elis`) and the two ranks are summed; the p0 with the
    lowest combined rank wins.

    Parameters
    ----------
    data : array-like
        Nominal data sample to bin.
    resample_list : iterable of array-like
        Bootstrap/resampled datasets used to average the error metric.
    roughs, elis : list
        Reference roughness / error values from competing binning methods.

    Returns
    -------
    float
        The optimal p0 for `bayesian_blocks`.
    """
    best_rank = np.inf
    best_p0 = 0
    # Log-spaced scan from 10**-4.5 up to 1.
    for p0 in np.logspace(-4.5, 0, 50):
        bin_edges = bayesian_blocks(data, p0=p0)
        tmp_hist = np.histogram(data, bins=bin_edges)
        tmp_hist_bw = tmp_hist[0] / np.diff(tmp_hist[1])
        tmp_rough = rough((tmp_hist_bw, tmp_hist[1]))
        # Average interpolation error over all resampled datasets.
        tmp_eli = np.mean([err_li(i, tmp_hist) for i in resample_list])

        # Rank the candidate (appended last) against the references.
        rank_rough = rankdata(roughs + [tmp_rough])
        rank_eli = rankdata(elis + [tmp_eli])
        tmp_rank = rank_eli[-1] + rank_rough[-1]

        # '<=' deliberately keeps the *largest* p0 among equally-ranked ones.
        if tmp_rank <= best_rank:
            best_p0 = p0
            best_rank = tmp_rank
    return best_p0
Code example #3
0
    def _df_binning_init(self, data, weights):
        '''Do an initial binning to get bin edges, total hist range, and break each set of data and
        weights into a dataframe (easier to handle errorbar calculation moving forward).

        Side effects: sets self.bin_edges, self.bin_range, self.widths,
        self.bin_centers, self.df_list, self.bin_content (and
        self.bin_content_orig), and possibly self.vis_object; may call
        self.calc_bin_error.
        '''

        # If bin edges are already determined, than skip initial histogramming
        self.bin_edges = None
        if isinstance(self.bins, Iterable) and not isinstance(self.bins, str):
            # An explicit sequence of edges was supplied; derive the
            # range from its endpoints if none was given.
            self.bin_edges = self.bins
            if self.bin_range is None:
                self.bin_range = (self.bin_edges[0], self.bin_edges[-1])

        # If bin edges need to be determined, there's a few different cases to consider
        else:
            if self.stacked:
                # Stacked data are binned as one combined sample.
                _n_data_sets = 1
                b_data = [np.concatenate(data)]
                if self.has_weights:
                    b_weights = [np.concatenate(weights)]
                else:
                    b_weights = None
            else:
                _n_data_sets = self.n_data_sets
                b_data = data
                b_weights = weights

            if self.bin_range is None:
                # Overall min/max across all (non-empty) data sets.
                xmin = np.inf
                xmax = -np.inf
                for i in range(_n_data_sets):
                    if len(data[i]) > 0:
                        xmin = min(xmin, min(b_data[i]))
                        xmax = max(xmax, max(b_data[i]))
                self.bin_range = (xmin, xmax)

            # Special case for Bayesian Blocks
            if self.bins in ['block', 'blocks']:

                # Single data-set or stacked
                if _n_data_sets == 1:

                    if self.has_weights:
                        b_weights = b_weights[0]
                    else:
                        b_weights = None
                    self.bin_edges = bayesian_blocks(data=b_data[0],
                                                     weights=b_weights,
                                                     **self.bin_dict)
                else:
                    raise ValueError(
                        'Cannot use Bayesian Blocks with multiple, unstacked datasets'
                    )

            else:
                # Any other bin spec (string rule or count) is delegated
                # to numpy's histogram machinery.
                _, self.bin_edges = np.histogram(b_data,
                                                 bins=self.bins,
                                                 weights=b_weights,
                                                 range=self.bin_range)

        self.widths = np.diff(self.bin_edges)
        self.bin_centers = self.bin_edges[:-1] + self.widths * 0.5

        # Now put the data into dataframes with the weights and bins
        self.df_list = []
        for i in range(self.n_data_sets):
            if weights is None:
                df = pd.DataFrame({'data': data[i]})
            else:
                df = pd.DataFrame({'data': data[i], 'weights': weights[i]})
            # include_lowest=True closes the left edge of the first bin so
            # entries exactly at bin_edges[0] are not dropped.
            df_bins = pd.cut(df.data, self.bin_edges, include_lowest=True)
            df['bins'] = df_bins
            self.df_list.append(df)

        # Make the initial histograms
        if self.histtype == 'marker':
            self.bin_content, _ = np.histogram(data,
                                               self.bin_edges,
                                               weights=weights,
                                               range=self.bin_range)
        else:
            self.bin_content, _, self.vis_object = self.ax.hist(
                data,
                self.bin_edges,
                weights=weights,
                range=self.bin_range,
                stacked=self.stacked,
                **self.hist_dict)

            # if self.stacked and self.errorbars and self.histtype == 'stepfilled':
            #     plt.setp(self.vis_object[-1][0], edgecolor='k')
            #     plt.setp(self.vis_object[-1][0], linewidth=2)

        # NOTE(review): if bin_content is an ndarray, `[:]` returns a *view*,
        # not a copy, so later in-place edits to bin_content would also alter
        # bin_content_orig — confirm whether np.array(...) was intended.
        self.bin_content_orig = self.bin_content[:]

        # Only compute errors when not suppressed by normed/scale.
        if self.errorbars == 'calc' and not (self.normed or self.scale):
            self.calc_bin_error(hist_mod='default')
Code example #4
0
def comp_study(input_data,
               n_events,
               xlims=None,
               resamples=100,
               dist_name='2Gauss'):
    """Compare binning rules (Sturges ... Bayesian Blocks) on one dataset.

    Draws a 3x3 grid of histograms, one per binning method, then a
    scatter plot and a stacked-rank bar chart of two metrics: roughness
    W_n (via `rough`) and average linear-interpolation error E-hat
    (via `err_li` over bootstrap resamples).  Both figures are saved as
    PDFs under `bb_dir`.

    Parameters
    ----------
    input_data : array-like
        Pool of events sampled when `dist_name` is not a synthetic
        distribution ('jPT', 'DY', or the fallback branch).
    n_events : int
        Size of the nominal sample.
    xlims : sequence, optional
        If given, the outermost Bayesian-Blocks edges are clamped to
        (xlims[0], xlims[-1]).
    resamples : int
        Number of bootstrap resamples used for the error metric.
    dist_name : str
        'Gauss', '2LP', 'jPT', 'DY', or anything else for the generic
        resampling path.
    """
    bb_dir = os.path.join('/Users/brianpollack/Coding/BayesianBlocks')
    do_log = True

    # --- Build the nominal sample and its bootstrap resamples -------------
    # data_nom = input_data[:n_events]
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        do_log = False

    elif dist_name == '2LP':
        # Two Laplace peaks plus a flat background (65/25/10 mixture).
        np.random.seed(33)
        data_nom = np.concatenate(
            (np.random.laplace(loc=90, scale=5, size=int(n_events * 0.65)),
             np.random.laplace(loc=110, scale=1.5, size=int(n_events * 0.25)),
             np.random.uniform(low=80, high=120, size=int(n_events * 0.10))))
        resample_list = np.concatenate(
            (np.random.laplace(
                loc=90, scale=5, size=(resamples, int(n_events * 0.65))),
             np.random.laplace(
                 loc=110, scale=1.5, size=(resamples, int(n_events * 0.25))),
             np.random.uniform(
                 low=80, high=120, size=(resamples, int(n_events * 0.10)))),
            axis=1)
        do_log = False

    elif dist_name == 'jPT':
        np.random.seed(11)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)

    elif dist_name == 'DY':
        np.random.seed(200)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)
    else:
        np.random.seed(1)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)

    # --- 3x3 histogram grid, one panel per binning rule --------------------
    fig_hist, axes_hist = plt.subplots(3,
                                       3,
                                       sharex=True,
                                       sharey=False,
                                       constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    # fig_hist.text(-0.03, 0.5, 'Entries/Bin Width', va='center', rotation='vertical', fontsize=20)
    # axes_hist[2][0].get_xaxis().set_ticks([])
    # axes_hist[2][1].get_xaxis().set_ticks([])
    # axes_hist[2][2].get_xaxis().set_ticks([])

    axes_hist[0][0].set_title('Sturges')
    hist_sturges_bw = skh_plt.hist(x=data_nom,
                                   histtype='stepfilled',
                                   bins='sturges',
                                   errorbars=False,
                                   alpha=0.5,
                                   log=do_log,
                                   scale='binwidth',
                                   err_type='gaussian',
                                   ax=axes_hist[0][0])

    axes_hist[0][1].set_title('Doane')
    hist_doane_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins='doane',
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[0][1])

    axes_hist[0][2].set_title('Scott')
    hist_scott_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins='scott',
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[0][2])

    axes_hist[1][0].set_title('Freedman Diaconis')
    axes_hist[1][0].set_ylabel('Entries/Bin Width', fontsize=20)
    hist_fd_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins='fd',
                              errorbars=False,
                              alpha=0.5,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[1][0])

    axes_hist[1][1].set_title('Knuth')
    # Knuth's rule isn't a numpy binning string; get the edges explicitly.
    _, bk = knuth_bin_width(data_nom, return_bins=True)
    hist_knuth_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins=bk,
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[1][1])

    axes_hist[1][2].set_title('Rice')
    hist_rice_bw = skh_plt.hist(x=data_nom,
                                histtype='stepfilled',
                                bins='rice',
                                errorbars=False,
                                alpha=0.5,
                                log=do_log,
                                scale='binwidth',
                                err_type='gaussian',
                                ax=axes_hist[1][2])

    axes_hist[2][0].set_title('Sqrt(N)')
    hist_sqrt_bw = skh_plt.hist(x=data_nom,
                                histtype='stepfilled',
                                bins='sqrt',
                                errorbars=False,
                                alpha=0.5,
                                log=do_log,
                                scale='binwidth',
                                err_type='gaussian',
                                ax=axes_hist[2][0])

    # bep = bep_optimizer(data_nom)
    # _, bep = pd.qcut(data_nom, nep, retbins=True)

    # Unweighted (non-binwidth-scaled) histograms for the error metric.
    hist_sturges = np.histogram(data_nom, bins='sturges')
    hist_doane = np.histogram(data_nom, bins='doane')
    hist_scott = np.histogram(data_nom, bins='scott')
    hist_fd = np.histogram(data_nom, bins='fd')
    hist_knuth = np.histogram(data_nom, bins=bk)
    hist_rice = np.histogram(data_nom, bins='rice')
    hist_sqrt = np.histogram(data_nom, bins='sqrt')

    # Roughness metric per method.
    # NOTE(review): plot=False is passed only for some calls — confirm
    # whether `rough`'s default plotting is wanted for the others.
    r_sturges = rough(hist_sturges_bw, plot=False)
    r_doane = rough(hist_doane_bw)
    r_scott = rough(hist_scott_bw)
    r_fd = rough(hist_fd_bw)
    r_knuth = rough(hist_knuth_bw, plot=False)
    r_rice = rough(hist_rice_bw)
    r_sqrt = rough(hist_sqrt_bw, plot=False)

    # Error metric on the nominal sample.
    eli_sturges = err_li(data_nom, hist_sturges)
    eli_doane = err_li(data_nom, hist_doane)
    eli_scott = err_li(data_nom, hist_scott)
    eli_fd = err_li(data_nom, hist_fd)
    eli_knuth = err_li(data_nom, hist_knuth)
    eli_rice = err_li(data_nom, hist_rice)
    eli_sqrt = err_li(data_nom, hist_sqrt)

    # Error metric averaged over the bootstrap resamples.
    avg_eli_sturges = []
    avg_eli_doane = []
    avg_eli_scott = []
    avg_eli_fd = []
    avg_eli_knuth = []
    avg_eli_rice = []
    avg_eli_sqrt = []
    for i in resample_list:
        avg_eli_sturges.append(err_li(i, hist_sturges))
        avg_eli_doane.append(err_li(i, hist_doane))
        avg_eli_scott.append(err_li(i, hist_scott))
        avg_eli_fd.append(err_li(i, hist_fd))
        avg_eli_knuth.append(err_li(i, hist_knuth))
        avg_eli_rice.append(err_li(i, hist_rice))
        avg_eli_sqrt.append(err_li(i, hist_sqrt))

    avg_eli_sturges = np.mean(avg_eli_sturges)
    avg_eli_doane = np.mean(avg_eli_doane)
    avg_eli_scott = np.mean(avg_eli_scott)
    avg_eli_fd = np.mean(avg_eli_fd)
    avg_eli_knuth = np.mean(avg_eli_knuth)
    avg_eli_rice = np.mean(avg_eli_rice)
    avg_eli_sqrt = np.mean(avg_eli_sqrt)

    avg_eli_list = [
        avg_eli_sturges, avg_eli_doane, avg_eli_scott, avg_eli_fd,
        avg_eli_knuth, avg_eli_rice, avg_eli_sqrt
    ]
    r_list = [r_sturges, r_doane, r_scott, r_fd, r_knuth, r_rice, r_sqrt]

    elis_list = [
        eli_sturges, eli_doane, eli_scott, eli_fd, eli_knuth, eli_rice,
        eli_sqrt
    ]

    # --- Equal-population binning, optimized against the fixed rules ------
    axes_hist[2][1].set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins=bep,
                              errorbars=False,
                              alpha=0.5,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[2][1])
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = []
    for i in resample_list:
        avg_eli_ep.append(err_li(i, hist_ep))
    avg_eli_ep = np.mean(avg_eli_ep)

    # --- Bayesian Blocks, with p0 optimized the same way ------------------
    axes_hist[2][2].set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        # Clamp the outer edges to the requested plotting range.
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins=bb,
                              errorbars=False,
                              alpha=1,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[2][2])
    # if n_events == 1000 and dist_name == '2LP':
    # axes_hist[2][2].set_ylim((0, 100))
    hist_bb = np.histogram(data_nom, bins=bb)
    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = []
    for i in resample_list:
        avg_eli_bb.append(err_li(i, hist_bb))
    avg_eli_bb = np.mean(avg_eli_bb)

    r_list.append(r_ep)
    r_list.append(r_bb)
    avg_eli_list.append(avg_eli_ep)
    avg_eli_list.append(avg_eli_bb)
    elis_list.append(eli_ep)
    elis_list.append(eli_bb)
    plt.savefig(bb_dir + f'/plots/bin_comp/hists_{dist_name}_{n_events}.pdf')

    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]

    # --- Metric summary figure: scatter (top) and rank bars (bottom) ------
    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    # BUG FIX: a stray `fig_hist.suptitle(...)` (copy-paste from the hist
    # figure, missing fontsize) used to sit here; it clobbered the already
    # saved histogram figure's title.  The metric figure is titled via
    # axes_metric[0].set_title below.
    for i in range(len(elis_list)):
        if xs[i] == 'BB':
            # Highlight Bayesian Blocks with a big black star.
            axes_metric[0].scatter(avg_eli_list[i],
                                   r_list[i],
                                   label=xs[i],
                                   s=400,
                                   marker='*',
                                   c='k')
        else:
            axes_metric[0].scatter(avg_eli_list[i],
                                   r_list[i],
                                   label=xs[i],
                                   s=200)
    axes_metric[0].set_ylabel(r'$W_n$ (Wiggles)')
    axes_metric[0].set_xlabel(r'$\hat{E}$ (Average Error)')
    # ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    # ax.relim()
    # ax.autoscale_view()
    axes_metric[0].grid()
    axes_metric[0].legend(ncol=1,
                          bbox_to_anchor=(1.05, 1.15),
                          loc='upper left')
    axes_metric[0].set_title(f'{dist_name} Distribution, N={n_events}',
                             fontsize=22)
    # plt.savefig(bb_dir+f'/plots/bin_comp/scat_{dist_name}_{n_events}.pdf')

    # plt.figure()
    # Stacked bars of the two rankings; the last bar (BB) is opaque.
    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')

    cont = axes_metric[1].bar(xs,
                              rank_rough,
                              0.35,
                              label=r'$W_n$ Ranking',
                              alpha=0.5)
    cont[-1].set_alpha(1)
    cont = axes_metric[1].bar(xs,
                              rank_avg_eli,
                              0.35,
                              bottom=rank_rough,
                              label=r'$\hat{E}$ Ranking',
                              alpha=0.5)
    cont[-1].set_alpha(1)
    axes_metric[1].legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    # axes_metric[1].set_title(f'Combined Ranking, {dist_name} Distribution, N={n_events}')
    axes_metric[1].set_xlabel('Binning Method')
    axes_metric[1].set_ylabel('Rank')
    plt.savefig(bb_dir + f'/plots/bin_comp/metric_{dist_name}_{n_events}.pdf')