def test_parameter_estimation(self):
    """Check that bagged linear models recover the slope and intercept of y = X + noise.

    Data is drawn with a true slope of 1 and a true intercept of 0. For each
    of the two parameters the test asserts that:

    * the mean over the fitted base models is within 0.3 of the true value;
    * the true value lies strictly inside both the central credible
      interval and the highest density interval computed with alpha=0.05.

    The random draws are not seeded, so the assertions are statistical.
    """
    X = np.random.uniform(0, 4, 1000)
    y = X + np.random.normal(0, 1, 1000)
    m = BayesianBootstrapBagging(LinearRegression(),
                                 10000,
                                 1000,
                                 low_mem=False)
    m.fit(X.reshape(-1, 1), y)

    # One (name, per-model samples, true value) row per estimated parameter.
    checks = (
        ("coefficient", [b.coef_ for b in m.base_models_], 1),
        ("intercept", [b.intercept_ for b in m.base_models_], 0),
    )
    interval_fns = (central_credible_interval, highest_density_interval)

    for name, samples, expected in checks:
        self.assertAlmostEqual(np.mean(samples), expected, delta=0.3,
                               msg="mean of {} samples".format(name))
        for interval_fn in interval_fns:
            lower, upper = interval_fn(samples, alpha=0.05)
            context = "{} of {} samples".format(interval_fn.__name__, name)
            self.assertLess(lower, expected, msg=context)
            self.assertGreater(upper, expected, msg=context)
# Example no. 2
# 0
 def test_hpdi(self):
     """Check highest_density_interval on two small shuffled samples.

     Each case is a list of values plus the alpha passed to
     highest_density_interval; in both cases the expected interval is
     exactly (1, 1.1).
     """
     cases = (
         ([0, 10, 1] + [1.1] * 7, 0.2),
         ([0, 10, 1.1, 1], 0.5),
     )
     for values, alpha in cases:
         lower, upper = highest_density_interval(self._shuffle(values),
                                                 alpha=alpha)
         self.assertEqual(lower, 1)
         self.assertEqual(upper, 1.1)
def plot_group_hdis(samples, labels, alpha, n_replications):
    """Plot one vertical highest-density interval and posterior mean per group.

    Args:
        samples: iterable of per-group observations; each element is passed
            to ``mean`` together with ``n_replications``.
        labels: tick labels, paired with ``samples`` by position. Pairing
            stops at the shorter of the two.
        alpha: forwarded to ``highest_density_interval``. ``None`` leaves
            that function's own default in effect.
        n_replications: second argument passed to ``mean`` for every group.

    Draws on the current matplotlib axes; returns nothing.
    """
    # Only forward alpha when the caller supplied one.
    hdi_kwargs = {} if alpha is None else {'alpha': alpha}

    for position, (group, _label) in enumerate(zip(samples, labels)):
        posterior = mean(group, n_replications)
        lower, upper = highest_density_interval(posterior, **hdi_kwargs)
        # Vertical segment spanning the interval, with a dot at the mean.
        plt.plot([position, position], [lower, upper])
        plt.plot([position], [np.mean(posterior)], marker='o')
    plt.xticks(range(len(labels)), labels)
def bb_hdi(a_bootstrap, b_bootstrap, alpha=0.05):
    """Summarise two bootstrap posteriors and the posterior of their difference.

    Args:
        a_bootstrap: a list of resampled means from page A journeys.
        b_bootstrap: a list of resampled means from page B journeys. It is
            subtracted element-wise against ``a_bootstrap``.
        alpha: passed as ``alpha`` to every ``bb.highest_density_interval``
            call made here.

    Returns:
        A dict with the following keys:

        * ``'a_ci_low'``, ``'a_ci_hi'``: interval bounds for A.
        * ``'b_ci_low'``, ``'b_ci_hi'``: interval bounds for B.
        * ``'diff_mean'``: mean of ``b_bootstrap - a_bootstrap``.
        * ``'diff_ci_low'``, ``'diff_ci_hi'``: interval bounds for that
          difference.
        * ``'prob_b_>_a'``: share of difference values strictly greater
          than 0, i.e. the proportion of the difference posterior in
          which B's mean exceeds A's.
    """
    def _interval(values):
        # Every interval in this summary uses the same alpha.
        return bb.highest_density_interval(values, alpha=alpha)

    lower_a, upper_a = _interval(a_bootstrap)
    lower_b, upper_b = _interval(b_bootstrap)

    # Posterior of the difference, B minus A.
    delta = np.asarray(b_bootstrap) - np.asarray(a_bootstrap)
    delta_mean = delta.mean()
    lower_delta, upper_delta = _interval(delta)

    # Count of positive differences divided by the number of observations.
    share_positive = (delta > 0).sum() / delta.shape[0]

    summary = {}
    summary['a_ci_low'] = lower_a
    summary['a_ci_hi'] = upper_a
    summary['b_ci_low'] = lower_b
    summary['b_ci_hi'] = upper_b
    summary['diff_mean'] = delta_mean
    summary['diff_ci_low'] = lower_delta
    summary['diff_ci_hi'] = upper_delta
    summary['prob_b_>_a'] = share_positive
    return summary
 def bootstrap():
     """Print and plot a bootstrap summary of the mean of column ``X`` of ``df``.

     Reads ``X`` and ``df`` from the enclosing scope. Prints the column
     name with its rounded mean, draws 10000 replications with ``bb.mean``,
     prints the rounded bounds returned by ``bb.highest_density_interval``
     (called with its default alpha), then shows and closes a distribution
     plot of the replications.
     """
     column = df[X]
     print(X, round(column.mean(), 2))

     replications = bb.mean(column, n_replications=10000)
     lower, upper = bb.highest_density_interval(replications)
     print('low ci:', round(lower, 2), 'high ci:', round(upper, 2))

     sns.distplot(replications)
     plt.show()
     plt.close()
# Example no. 6
# 0
def plot_mean_bootstrap_exponential_readme():
    """Render the two-panel README figure comparing two bootstraps of a mean.

    Four exponential draws are bootstrapped twice: classically (mean of
    ``resample(X)``, 10000 times) and with ``mean(X, 10000)``. Each set of
    samples gets its own panel with a distribution plot and a horizontal bar
    for its ``highest_density_interval``. The figure is written to
    ``readme_exponential.png``.
    """
    X = np.random.exponential(7, 4)

    # Keep the draw order: classical resamples first, then posterior samples.
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    posterior_samples = mean(X, 10000)

    posterior_hdi = highest_density_interval(posterior_samples)
    classical_hdi = highest_density_interval(classical_samples)

    # (row, title, samples, legend label, interval), top panel first.
    panels = (
        (1, 'Bayesian Bootstrap of mean', posterior_samples,
         'Bayesian Bootstrap Samples', posterior_hdi),
        (2, 'Classical Bootstrap of mean', classical_samples,
         'Classical Bootstrap Samples', classical_hdi),
    )
    for row, title, draws, legend_label, (lower, upper) in panels:
        plt.subplot(2, 1, row)
        plt.title(title)
        sns.distplot(draws, label=legend_label)
        plt.plot([lower, upper], [0, 0],
                 linewidth=5.0, marker='o', label='95% HDI')
        plt.xlim(-1, 18)
        plt.legend()

    plt.savefig('readme_exponential.png', bbox_inches='tight')
# Example no. 7
# 0
    def run(self):
        """Stream samples through the change-point detector, then plot the results.

        Samples are pulled from a fresh ``ExampleLoader`` until its
        ``sample()`` returns ``False``. Each step does one of three things:

        1. fills the reference window (``self._reference``) until it is full;
        2. then fills the test window (``self._test``) until it is full;
        3. afterwards slides both windows by one sample, computes
           ``self._re_sample_trial`` change-point scores with the method
           named by ``self._score_method``, and derives ``gamma`` as the
           lower interval bound of the current score mean minus the oldest
           stored upper bound in ``self._xi_up``.

        A step with ``gamma > 0`` is recorded in ``self._change_point_index``.
        An alarm is added to ``self._change_point_alarm_index`` once
        ``self._continuous_cp_alarm`` is full of consecutive steps, unless
        the latest alarm already lies inside that run.

        Exactly one entry is appended to ``self._change_point_score`` and to
        ``self._gamma`` per step, so both stay aligned with the step counter
        used as the x-axis of the plots (assuming they start out empty).

        Three figures are shown at the end: the raw data, the change-point
        score (mean +/- 2 standard deviations) and gamma.
        """
        data_loader = ExampleLoader()

        sample = data_loader.sample()

        step_num = 0
        scatter_time = []
        while sample is not False:
            # One x-coordinate per value of the sample, for the scatter plot.
            scatter_time.append([data_loader.data_index - 1] * len(sample))
            self._samples.append(sample)

            if len(self._reference) < self._reference.maxlen:
                # Wait until it collects full stack of reference window.
                self._reference.append(sample)
                self._change_point_score.append([0] * self._re_sample_trial)
                self._gamma.append(0)
            elif len(self._test) < self._test.maxlen:
                # Wait until it collects full stack of test window.
                self._test.append(sample)
                self._change_point_score.append([0] * self._re_sample_trial)
                self._gamma.append(0)
            else:
                # Slide both windows: the oldest test sample moves into the
                # reference window and the new sample enters the test window.
                _sample = self._test.popleft()
                self._reference.append(_sample)
                self._test.append(sample)

                _change_point_score = []
                for _idx in range(0, self._re_sample_trial):
                    _reference = define_signatures_set(self._reference)
                    _test = define_signatures_set(self._test)

                    if self._score_method == 'log likelihood':
                        _cp_score = calculate_log_likelihood_cp_score(
                            _reference, _test)
                    elif self._score_method == 'symmetrized kl':
                        _cp_score = calculate_kl_cp_score(_reference, _test)
                    else:
                        print("Not allowed scoring method: {}.".format(
                            self._score_method))
                        print(
                            "\tAccepted values for score method: 'log likelihood' or 'symmetrized kl'"
                        )
                        # NOTE(review): this only leaves the resampling loop;
                        # the code below still runs on the (possibly empty)
                        # score list. Consider validating the method up front.
                        break
                    _change_point_score.append(_cp_score)

                # Calculate confidence interval (standard error) for cp score mean
                _test_density = mean(_change_point_score, 10000)
                _test_low, _test_up = highest_density_interval(_test_density,
                                                               alpha=0.001)

                self._change_point_score.append(_change_point_score)

                if len(self._xi_up) < self._xi_up.maxlen:
                    # Still collecting upper bounds to compare against.
                    self._xi_up.append(_test_up)
                    self._gamma.append(0)
                else:
                    _gamma = _test_low - self._xi_up.popleft()

                    self._xi_up.append(_test_up)
                    self._gamma.append(_gamma)

                    if _gamma > 0:
                        self._change_point_index.append(step_num)
                        self._continuous_cp_alarm.append(step_num)

                        _alarms = self._continuous_cp_alarm
                        # Full window whose first entry is exactly maxlen - 1
                        # steps back: every recent step raised gamma > 0.
                        _is_consecutive_run = (
                            len(_alarms) == _alarms.maxlen
                            and _alarms[0] == step_num - _alarms.maxlen + 1)
                        if _is_consecutive_run:
                            _already_reported = (
                                len(self._change_point_alarm_index) > 0
                                and self._change_point_alarm_index[-1] in _alarms)
                            # Deliberately a plain guard rather than
                            # `continue`: jumping to the next iteration here
                            # would bypass the sample fetch and the step
                            # increment at the bottom of the loop.
                            if not _already_reported:
                                self._change_point_alarm_index.append(
                                    _alarms[0])

            sample = data_loader.sample()
            step_num += 1

        # Drawing
        plt.figure(figsize=[15, 4])
        plt.scatter(scatter_time,
                    self._samples,
                    marker='o',
                    s=10,
                    c='black',
                    alpha=0.1)
        plt.title('Data')
        plt.xlabel('Time')
        plt.ylabel('Y')
        plt.xlim([0, step_num + 2])
        plt.xticks(np.arange(0, step_num + 2, 50))
        for cp in self._change_point_alarm_index:
            # On the data plot the alarm line is shifted back by an amount
            # derived from self._tau, depending on the scoring method.
            if self._score_method == 'log likelihood':
                plt.axvline(x=cp - self._tau + 1, color='red', ls='--', lw=1)
            else:
                plt.axvline(x=cp - int(np.ceil(self._tau / 2)) + 1,
                            color='red',
                            ls='--',
                            lw=1)
        plt.show()

        plt.figure(figsize=[15, 4])
        # Per-step mean and spread over the resampling trials.
        mu = np.mean(self._change_point_score, axis=1)
        std = np.std(self._change_point_score, axis=1)

        plt.plot(mu, ls='--', c='black', alpha=1.0)
        plt.fill_between(np.arange(0, step_num),
                         mu - 2 * std,
                         mu + 2 * std,
                         color='black',
                         alpha=0.3)
        plt.title('Change Point Score')
        plt.xlabel('Time')
        plt.ylabel('Change Point Score')
        plt.xlim([0, step_num + 2])
        plt.xticks(np.arange(0, step_num + 2, 50))
        for cp in self._change_point_alarm_index:
            plt.axvline(x=cp, color='red', ls='--', lw=1)
        plt.show()

        plt.figure(figsize=[15, 4])
        plt.plot(self._gamma, ls='--', c='black', alpha=1.0)
        plt.title('Gamma')
        plt.xlabel('Time')
        plt.ylabel('Gamma')
        plt.xlim([0, step_num + 2])
        plt.xticks(np.arange(0, step_num + 2, 50))
        # gamma > 0 is the detection condition, so mark the zero line.
        plt.axhline(y=0, color='green', lw=2, ls='--')
        for cp in self._change_point_alarm_index:
            plt.axvline(x=cp, color='red', ls='--', lw=1)
        plt.show()
# Example no. 8
# 0
# Package installation is a notebook/shell command, not a Python statement;
# it is kept as a commented magic so this file parses as Python.
# %pip install bayesian_bootstrap

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Four draws from an exponential distribution with scale 7.
X = np.random.exponential(7, 4)

# +
from bayesian_bootstrap.bootstrap import mean, highest_density_interval, BayesianBootstrapBagging
# 10000 bootstrap replications of the mean of X.
posterior_samples = mean(X, 10000)
l, r = highest_density_interval(posterior_samples)

plt.title('Bayesian Bootstrap of mean')
sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
# Horizontal bar along y=0 spanning the interval.
plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
# -

from bayesian_bootstrap.bootstrap import bayesian_bootstrap
# Same statistic (np.mean) through the generic entry point.
posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)

# Five standard-normal inputs as a (5, 1) column; y is the same values
# flattened to shape (5,) plus standard-normal noise.
X = np.random.normal(0, 1, 5).reshape(-1, 1)
y = X.reshape(1, -1).reshape(5) + np.random.normal(0, 1, 5)

# NOTE(review): LinearRegression is not imported anywhere in this script as
# shown; it presumably comes from sklearn.linear_model - confirm and import it.
m = BayesianBootstrapBagging(LinearRegression(), 10000, 1000)
m.fit(X, y)

import utils

# Each call returns a pair that is unpacked as (pre, post) for one user group.
p_pre, p_post = utils.load_users_covid('politicians')
rfriends_pre, rfriends_post = utils.load_users_covid('random-friends')
rfollowers_pre, rfollowers_post = utils.load_users_covid('random-followers')