Example #1
    def test_xgb(self):
        sample_size_cohort = int(
            np.floor(len(self.test_data_cohort) * 4 / 5))
        sample_size_control = int(
            np.floor(len(self.test_data_control) * 4 / 5))
        auc = []
        auprc = []
        for i in range(self.boost_iteration):
            test_cohort = resample(self.test_data_cohort,
                                   n_samples=sample_size_cohort)
            test_control = resample(self.test_data_control,
                                    n_samples=sample_size_control)
            self.aquire_batch_data_cohort(0, test_cohort, len(test_cohort))
            self.aquire_batch_data_control(0, test_control, len(test_control))
            self.aquire_batch_data_whole()
            # print(self.lr.score(self.one_batch_data,self.one_batch_logit))
            auc.append(
                roc_auc_score(
                    self.one_batch_logit_whole,
                    self.xg_model.predict_proba(self.one_batch_data_whole)[:, 1]))
            auprc.append(
                average_precision_score(
                    self.one_batch_logit_whole,
                    self.xg_model.predict_proba(self.one_batch_data_whole)[:, 1]))

        print("auc")
        print(bs.bootstrap(np.array(auc), stat_func=bs_stats.mean))
        print("auprc")
        print(bs.bootstrap(np.array(auprc), stat_func=bs_stats.mean))
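A minimal usage sketch (not from the original source) of how the BootstrapResults object returned by bs.bootstrap above is typically unpacked; the auc array is a synthetic stand-in for the scores collected in the loop.

import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats

auc = np.random.uniform(0.70, 0.90, size=100)  # synthetic stand-in for the AUC list
result = bs.bootstrap(auc, stat_func=bs_stats.mean)
# BootstrapResults carries the point estimate and the confidence bounds
print(result.value, result.lower_bound, result.upper_bound)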
Example #2
def test_pivotal(self):
    mean = 100
    stdev = 10

    test = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
    test = test * 1.1

    bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change)

    bsr_percent = bs.bootstrap_ab(test,
                                  ctrl,
                                  bs_stats.mean,
                                  bs_compare.percent_change,
                                  is_pivotal=False)
    self.assertAlmostEqual(bsr.value, bsr_percent.value, delta=.1)

    self.assertAlmostEqual(bsr.lower_bound, bsr_percent.lower_bound, delta=.1)

    self.assertAlmostEqual(bsr.upper_bound, bsr_percent.upper_bound, delta=.1)

    bsr = bs.bootstrap(test, bs_stats.mean)

    bsr_percent = bs.bootstrap(test, bs_stats.mean, num_threads=10)
    self.assertAlmostEqual(bsr.value, bsr_percent.value, delta=.1)

    self.assertAlmostEqual(bsr.lower_bound, bsr_percent.lower_bound, delta=.1)

    self.assertAlmostEqual(bsr.upper_bound, bsr_percent.upper_bound, delta=.1)
Example #3
def eval_stats(exp, pred, N):
    error = np.abs(exp - pred)
    rmse_bootstrap_dist = bs.bootstrap(np.reshape(error, (N, -1)),
                                       stat_func=RMSE_function,
                                       num_iterations=1000,
                                       alpha=0.05,
                                       is_pivotal=True,
                                       return_distribution=True)
    rmse = np.sqrt(np.mean(error**2))
    tau = scipy.stats.kendalltau(exp, pred)[0]
    tau_bootstrap_dist = bs.bootstrap(np.reshape(
        np.array(list(zip(exp, pred))), (N, -1)),
                                      stat_func=tau_function,
                                      num_iterations=1000,
                                      alpha=0.05,
                                      is_pivotal=True,
                                      return_distribution=True)
    return [
        rmse,
        np.percentile(rmse_bootstrap_dist, 2.5),
        np.percentile(rmse_bootstrap_dist, 97.5),
        tau,
        np.percentile(tau_bootstrap_dist, 2.5),
        np.percentile(tau_bootstrap_dist, 97.5),
    ]
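RMSE_function and tau_function are custom stat_func callables defined elsewhere in that project. A plausible sketch of the RMSE one, following the stat_func convention shown in Example #29's docstring (one resample per row, reduced along axis); tau_function would analogously apply scipy.stats.kendalltau to each resampled set of (exp, pred) pairs.

import numpy as np

def RMSE_function(values, axis=1):
    # bootstrapped stat_func convention: `values` holds one resample per row;
    # reduce along `axis` to get one statistic per resample
    return np.sqrt(np.mean(np.asarray(values) ** 2, axis=axis))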
Example #4
 def test(self):
     sample_size_cohort = int(
         np.floor(len(self.test_data_cohort) * 4 / 5))
     sample_size_control = int(
         np.floor(len(self.test_data_control) * 4 / 5))
     auc = []
     auprc = []
     for i in range(self.boost_iteration):
         print(i)
         test_cohort = resample(self.test_data_cohort,
                                n_samples=sample_size_cohort)
         test_control = resample(self.test_data_control,
                                 n_samples=sample_size_control)
         test_data = test_cohort + test_control
         logit_test = np.zeros(len(test_cohort) + len(test_control))
         logit_test[0:len(test_cohort)] = 1
         self.aquire_batch_data(0, test_data, len(test_data), logit_test)
         # print(self.lr.score(self.one_batch_data,self.one_batch_logit))
         self.out_logit = self.sess.run(
             self.logit_sig, feed_dict={self.input_x: self.one_batch_data})
         #self.init_hiddenstate: init_hidden_state})
         #self.input_x_static: self.one_batch_data_static})
         auc.append(roc_auc_score(self.one_batch_logit, self.out_logit))
         auprc.append(
             average_precision_score(self.one_batch_logit, self.out_logit))
     print("auc")
     print(bs.bootstrap(np.array(auc), stat_func=bs_stats.mean))
     print("auprc")
     print(bs.bootstrap(np.array(auprc), stat_func=bs_stats.mean))
Example #5
    def test_bootstrap_ratio(self):
        denom = np.array(([10] * 100) + ([1 / 10.] * 100))
        samples = np.array((([1 / 10.] * 100) + [10] * 100))

        bsr = bs.bootstrap(samples, bs_stats.mean, denominator_values=denom)

        self.assertAlmostEqual(bsr.value, 1, delta=.1)

        bsr = bs.bootstrap(samples / denom, bs_stats.mean)
        self.assertAlmostEqual(bsr.value, 50, delta=5)
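The two assertions differ because denominator_values bootstraps the ratio of means (close to 1 here), while pre-dividing bootstraps the mean of per-element ratios (close to 50). A quick arithmetic check:

import numpy as np

denom = np.array([10.0] * 100 + [0.1] * 100)
samples = np.array([0.1] * 100 + [10.0] * 100)

print(samples.mean() / denom.mean())  # ratio of means: 5.05 / 5.05 = 1.0
print((samples / denom).mean())       # mean of ratios: (100*0.01 + 100*100) / 200 ~= 50.0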
Example #6
def calculate_real_gain(real_inputs, real_outputs, sampling_size,
                        input_distance_type, output_distance_type, model):
    gains = []
    while len(gains) < sampling_size:
        subsampling_idxs = random.sample(
            range(len(real_outputs)), min(sampling_size * 2, len(real_outputs))
        )  #numpy.random.randint(0, len(outputs), (args.sampling_size*2,))

        if len(subsampling_idxs) % 2 != 0:
            subsampling_idxs = subsampling_idxs[1:]

        batch1 = subsampling_idxs[:int(len(subsampling_idxs) / 2)]
        batch2 = subsampling_idxs[int(len(subsampling_idxs) / 2):]
        inputs1 = [real_inputs[i] for i in batch1]
        inputs2 = [real_inputs[i] for i in batch2]
        outputs1 = [real_outputs[i] for i in batch1]
        outputs2 = [real_outputs[i] for i in batch2]

        if input_distance_type == "infersent-cosine":
            inputs1 = model.encode(inputs1,
                                   bsize=128,
                                   tokenize=False,
                                   verbose=True)
            inputs2 = model.encode(inputs2,
                                   bsize=128,
                                   tokenize=False,
                                   verbose=True)

        if output_distance_type == "infersent-cosine":
            outputs1 = model.encode(outputs1,
                                    bsize=128,
                                    tokenize=False,
                                    verbose=True)
            outputs2 = model.encode(outputs2,
                                    bsize=128,
                                    tokenize=False,
                                    verbose=True)

        for in1, in2, out1, out2 in zip(inputs1, inputs2, outputs1, outputs2):
            input_distance = distance(in1,
                                      in2,
                                      distance_type=input_distance_type)
            output_distance = distance(out1,
                                       out2,
                                       distance_type=output_distance_type)
            gain = output_distance / (input_distance + EPS)
            gains.append(gain)

    gains = np.array(gains)
    # calculate bootstrap estimates for the mean and standard deviation
    mean_results = bs.bootstrap(gains, stat_func=bs_stats.mean)

    # see advanced_bootstrap_features.ipynb for a discussion of how to use the stat_func arg
    stdev_results = bs.bootstrap(gains, stat_func=bs_stats.std)
    return mean_results, stdev_results
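distance() and EPS come from elsewhere in that codebase; a hypothetical stand-in for the cosine branch, assuming the inputs are embedding vectors, could look like this.

import numpy as np

EPS = 1e-8  # assumed small constant guarding the division in `gain`

def distance(a, b, distance_type="infersent-cosine"):
    # hypothetical stand-in: cosine distance between two embedding vectors
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + EPS)
    return 1.0 - cos_sim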
Example #7
def dist_plot(imgt, imgc, trt_name, ctr_name, meas, qname, unit, xdelta,
              binstep):
    if (unit != ""):
        unit = " [" + unit + "]"

    h = np.linspace(
        min(np.amin(imgt[meas + "_true"]), np.amin(imgc[meas + "_true"])),
        max(np.amax(imgt[meas + "_true"]), np.amax(imgc[meas + "_true"])), 11)

    print("Treated vs control for ", meas, ":")
    print(">:",
          np.sum(imgt[meas + "_est"].values > imgc[meas + "_est"].values))
    print("<:",
          np.sum(imgt[meas + "_est"].values < imgc[meas + "_est"].values))

    sw, pval = spst.wilcoxon(imgt[meas + "_err"], imgc[meas + "_err"])
    print("Pairwise difference:", trt_name, "-", ctr_name, "for", meas, unit)
    print('Wilcoxon (t, pval): %.3lf, %.5lf' % (sw, pval))
    print("Treated:", np.mean(imgt[meas + "_est"]), unit, "vs untreated:",
          np.mean(imgc[meas + "_est"]), unit)

    print("Effect strength:", bs.bootstrap(imgt[meas+"_est"].values - imgc[meas+"_est"].values,\
                              stat_func=bs_stats.mean, alpha=0.05, num_iterations=10000))

    imgt[meas + "_rel_diff"] = 2 * (
        imgt[meas + "_est"].values - imgc[meas + "_est"].values) / (
            imgt[meas + "_true"].values + imgc[meas + "_true"].values)
    sw, pval = spst.wilcoxon(imgt[meas + "_rel_diff"])
    print("\nRelative pairwise difference:", trt_name, "-", ctr_name, "for",
          meas, '%')
    print('Wilcoxon (t, pval): %.3lf, %.5lf' % (sw, pval))
    print(
        "Effect strength:",
        bs.bootstrap(imgt[meas + "_rel_diff"].values,
                     stat_func=bs_stats.mean,
                     alpha=0.05,
                     num_iterations=10000))

    constant_bins = range(-xdelta, xdelta, binstep)
    sns.distplot(imgt[meas + "_est"].values - imgc[meas + "_est"].values,
                 bins=constant_bins,
                 color=get_meas_color(meas)[0])
    plt.axvline(x=0, color="black", linewidth=1.0, linestyle="dashed")
    plt.xlim(-xdelta, xdelta)
    plt.xlabel("Estimated within-pair %s diff.%s: %s minus %s" %
               (qname, unit, trt_name, ctr_name),
               fontsize=fslegend)
    plt.ylabel('Relative frequency', fontsize=fslegend)
    plt.xticks(fontsize=fsticks)
    plt.yticks(fontsize=fsticks)
    plt.legend(fontsize=fslegend)
    plt.tight_layout()
    plt.savefig('Plots/' + 'plot_dist_' + meas + '_pairwise_est_clean_' +
                trt_name + '_' + ctr_name + '.pdf')
    plt.show()
Example #8
    def confidence_intervals(self, do_new_class_ci=False, alpha=0.05):
        # calculate a bootstrap confidence interval for the mean trial result
        ci_obj = bs.bootstrap(np.array(self.trial_results), stat_func=bs_stats.mean, alpha=alpha)
        m = (ci_obj.value, ci_obj.lower_bound, ci_obj.upper_bound)

        if do_new_class_ci:
            nc_obj = bs.bootstrap(np.array(self.new_class_results), stat_func=bs_stats.mean, alpha=alpha)
            nc = (nc_obj.value, nc_obj.lower_bound, nc_obj.upper_bound)
        else:
            nc = (0, 0, 0)

        return m, nc
Example #9
 def apply_bootstrap(self, data):
     '''
     Bootstrap the per-user action sums of the control variant and
     return the bootstrap distribution of the mean.
     '''
     control = data[data['VARIANT_NAME'] == 'control']
     return bs.bootstrap(control.groupby('USER_ID').action.sum().values,
                         stat_func=bs_stats.mean,
                         num_iterations=10000,
                         iteration_batch_size=300,
                         return_distribution=True)
Example #10
    def test_bootstrap_batch_size(self):
        mean = 100
        stdev = 10

        test = np.random.normal(loc=mean, scale=stdev, size=500)
        ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
        test = test * 1.1

        bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                              bs_compare.percent_change)

        bsr_batch = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                                    bs_compare.percent_change,
                                    iteration_batch_size=10)
        self.assertAlmostEqual(
            bsr.value,
            bsr_batch.value,
            delta=.1
        )

        self.assertAlmostEqual(
            bsr.lower_bound,
            bsr_batch.lower_bound,
            delta=.1
        )

        self.assertAlmostEqual(
            bsr.upper_bound,
            bsr_batch.upper_bound,
            delta=.1
        )

        bsr = bs.bootstrap(test, bs_stats.mean)

        bsr_batch = bs.bootstrap(test, bs_stats.mean,
                                 iteration_batch_size=10)
        self.assertAlmostEqual(
            bsr.value,
            bsr_batch.value,
            delta=.1
        )

        self.assertAlmostEqual(
            bsr.lower_bound,
            bsr_batch.lower_bound,
            delta=.1
        )

        self.assertAlmostEqual(
            bsr.upper_bound,
            bsr_batch.upper_bound,
            delta=.1
        )
Example #11
def getBootstrapHellKl(beta1, beta2, density, bootstrapSampleSize):
    resultListKL = []
    resultListHell = []
    for i in range(bootstrapSampleSize):
        resultListHell.append(
            distanceMetrics.hellinger1(beta1.getDistribution(density),
                                       beta2.getDistribution(density)))
        resultListKL.append(
            distanceMetrics.dkl(beta1.getDistribution(density),
                                beta2.getDistribution(density)))
    rBKL = bs.bootstrap(numpy.array(resultListKL), stat_func=bs_stats.mean)
    rBHell = bs.bootstrap(numpy.array(resultListHell), stat_func=bs_stats.mean)
    return rBKL, rBHell
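distanceMetrics.hellinger1 and distanceMetrics.dkl are project-specific; hypothetical stand-ins using the standard definitions over discrete distributions:

import numpy as np

def hellinger1(p, q):
    # Hellinger distance between discrete distributions p and q
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2.0)

def dkl(p, q):
    # Kullback-Leibler divergence D(p || q); assumes q > 0 wherever p > 0
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))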
Example #12
def generate_rastrigin_statistics(pop_size, runs=30, n=10):
    m = Rastrigin(n)

    final_best_objective = []
    final_best_sol = []
    test_best_fitness = []
    test_mean_fitness = []
    final_mean_fitness = []

    for i in range(runs):
        ga_instance = GA([-5.12] * n, [5.12] * n,
                         m.f,
                         pop_size=pop_size,
                         num_bits=20)
        ga_instance.run()

        ga_instance.save_results(i)

        mean_fitness = [np.mean(v) for v in ga_instance.generation_fitness]
        best_fitness = [np.max(v) for v in ga_instance.generation_fitness]

        test_best_fitness.append(best_fitness)
        test_mean_fitness.append(mean_fitness)

        final_best_objective.append(ga_instance.best_objective)
        final_best_sol.append(ga_instance.best_solution)
        final_mean_fitness.append(
            ga_instance.descale(np.mean(ga_instance.population_fitness)))

        print('BEST SOL: {}'.format(ga_instance.best_solution))
        print('BEST FOBJ: {}'.format(ga_instance.best_objective))
        print('=================================================')

    bs_best_fitness = bs.bootstrap(np.array(final_best_objective),
                                   stat_func=bs_stats.mean)

    bs_mean_fitness = bs.bootstrap(np.array(final_mean_fitness),
                                   stat_func=bs_stats.mean)

    # print(statistics.describe())
    print('Best final solution: {} 95% CI ({}, {})'.format(
        bs_best_fitness.value, bs_best_fitness.lower_bound,
        bs_best_fitness.upper_bound))

    print('Best mean final solution: {} 95% CI ({}, {})'.format(
        bs_mean_fitness.value, bs_mean_fitness.lower_bound,
        bs_mean_fitness.upper_bound))

    return test_best_fitness, test_mean_fitness, bs_best_fitness, bs_mean_fitness, final_best_sol
Example #13
def main():
    size = INPUT_SHAPE[0]
    num_trials = 30
    samples = []
    for _ in range(num_trials):
        results = []
        for _ in range(100):
            params, img = noisy_circle(size, RADIUS, 2)
            params = list(params)
            detected_center = center_predictor.predict([
                np.array([np.expand_dims(img, -1)]),
                np.array([np.expand_dims(img, -1)]),
            ])[0]
            detected_radius = radius_predictor.predict([
                np.array([np.expand_dims(img, -1)]),
                np.array([np.expand_dims(img, -1)]),
            ])[0]
            detected = [
                detected_center.tolist()[0],
                detected_center.tolist()[1]
            ] + detected_radius.tolist()
            ret = iou(params, detected)
            results.append(ret)
        results = np.array(results)
        precision = (results > 0.7).mean()
        samples.append(precision)
    samples = np.array(samples)
    bs_ret = bs.bootstrap(samples, stat_func=bs_stats.mean, alpha=0.05)
    print(bs_ret)
Example #14
def main():
    """The main function."""
    args = parse_args()
    plot_data_paths = args.plot_data
    num_plot_datas = len(plot_data_paths)

    aucs = []

    for plot_data_path in plot_data_paths:
        if plot_data_path.stat().st_size == 0:
            continue

        with plot_data_path.open() as inf:
            df = read_plot_data(inf)
            if df.empty:
                continue

        df['unix_time'] = df.unix_time - df.unix_time.iloc[0]

        total_cov = df.map_size.iloc[-1]
        percentile_cov = total_cov * args.percentile
        df_percentile = df[df.map_size <= percentile_cov]
        if len(df_percentile) < 2:
            df_percentile = df[0:2]

        auc = metrics.auc(df_percentile.unix_time, df_percentile.map_size)
        aucs.append(auc)

    # Compute the mean AUC and confidence intervals
    auc_ci = bs.bootstrap(np.array(aucs), stat_func=bs_stats.mean)
    print(f'mean AUC ({num_plot_datas} plot_data files)')
    print(f'  {auc_ci.value:.02f} +/- {auc_ci.error_width() / 2:.02f}')
Example #15
def avg_velocity_from_k(episodes, k=0.0):
	avg_vel = []

	for e in episodes:
		steps = v[v['episode'] == e][['x', 'y']]
		steps = steps[steps['x'] >= k]
		if steps.shape[0] < 200:
			continue
		last, first = steps.iloc[-1], steps.iloc[0]
		avg_velocity = (last['x'] - first['x'])/((last.name - first.name) * STEP_TIME)
		avg_vel.append(avg_velocity)

	bs_mean_step = bs.bootstrap(np.array(avg_vel), stat_func=bs_stats.mean, alpha=0.05)
	bs_std_step = bs.bootstrap(np.array(avg_vel), stat_func=bs_stats.std, alpha=0.05)

	return bs_mean_step.value, bs_std_step.value, bs_mean_step.upper_bound, bs_mean_step.lower_bound, np.max(avg_vel), np.min(avg_vel)
Example #16
def calculate_metrics(results):
    import bootstrapped.bootstrap as bs
    import bootstrapped.stats_functions as bs_stats

    stat_dict = {}

    for s in tqdm(results):
        for t in results[s]:
            for p in results[s][t]:
                if p not in stat_dict:
                    stat_dict[p] = {}
                opts = np.array([result.fun for result in results[s][t][p]])
                stats = bs.bootstrap(opts,
                                     stat_func=bs_stats.mean,
                                     num_iterations=1000000,
                                     iteration_batch_size=100000,
                                     num_threads=-1)
                l, m, u = stats.lower_bound, stats.value, stats.upper_bound
                stat_dict[p][s] = "%s<%s<%s" % tuple(
                    round(v, 3) for v in (l, m, u))

    from pandas import DataFrame
    # https://stackoverflow.com/questions/19258772/write-2d-dictionary-into-a-dataframe-or-tab-delimited-file-using-python
    df = DataFrame(stat_dict, index=list(results.keys()))
    df = df.T
    return df
Example #17
def bootstrap(dataset: Dataset, combined_data: CombinedData):
    calculations = {}

    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()

    for y in ys:
        # for now
        assert (len(ys) == 1)

        # Main effects
        for x in xs:
            cat = list(x.metadata[categories].keys())
            for c in cat:
                cat_data = dataset.select(
                    y.metadata[name], where=[f"{x.metadata[name]} == '{c}'"])
                stat = bs.bootstrap(cat_data.to_numpy(),
                                    stat_func=bs_stats.median)
                calculations[c] = stat
                # store the median and its confidence interval per category

    return calculations
Example #18
def generate_rastrigin_statistics(pop_size, mutation_probability, crossover_probability, runs=30, n=10):
    m = Rastrigin(n)

    final_best_fitness = []
    final_best_sol = []
    test_best_fitness = []
    test_mean_fitness = []

    for _ in range(runs):
        ga_instance = GA([-5.12] * n, [5.12] * n, m.f,
                         num_generations=10000,
                         mutation_probability=mutation_probability,
                         pop_size=pop_size,
                         crossover_probability=crossover_probability)
        ga_instance.run()

        mean_fitness = [np.mean(v) for v in ga_instance.generation_fitness]
        best_fitness = [np.max(v) for v in ga_instance.generation_fitness]

        test_best_fitness.append(best_fitness)
        test_mean_fitness.append(mean_fitness)

        final_best_fitness.append(ga_instance.best_objective)
        final_best_sol.append(ga_instance.best_solution)

    # Generate statistics table
    statistics = pd.DataFrame()
    statistics['Final best solution'] = np.array(final_best_fitness)

    bs_best_fitness = bs.bootstrap(
        np.array(final_best_fitness), stat_func=bs_stats.mean)

    # print(statistics.describe())
    print('Best final solution: {} 95% CI ({}, {})'.format(
        bs_best_fitness.value, bs_best_fitness.lower_bound,
        bs_best_fitness.upper_bound))

    return test_best_fitness, test_mean_fitness, bs_best_fitness, final_best_sol, statistics
Example #19
def QEr_Qboot(bindf, bins=[5, 10, 20, 30, 40, 50, 70, 150], silent=False):

    qbootsigs = np.zeros((np.shape(bins)[0] - 1, ))
    qbootsigerrsu = np.zeros((np.shape(bins)[0] - 1, ))
    qbootsigerrsl = np.zeros((np.shape(bins)[0] - 1, ))

    for i, Qv in enumerate(bindf):
        if not silent:
            print(np.shape(Qv))
        Qv = np.asarray(Qv)
        #print(Qv[0:10])
        try:
            bsr = bs.bootstrap(Qv,
                               stat_func=bs_stats.std,
                               iteration_batch_size=100)
        except MemoryError:
            print('There was a memory error - too much memory to be allocated')
            raise

        if not silent:
            print(bsr)
        qbootsigs[i] = np.std(Qv)
        qbootsigerrsu[i] = bsr.upper_bound
        qbootsigerrsl[i] = bsr.lower_bound

    #change over to size of error bars, not confidence interval
    qbootsigerrsu = qbootsigerrsu - qbootsigs
    qbootsigerrsl = -qbootsigerrsl + qbootsigs

    return qbootsigs, qbootsigerrsl, qbootsigerrsu
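The subtraction at the end converts the confidence-interval bounds into error-bar half-widths, which maps directly onto matplotlib's asymmetric yerr convention. A hypothetical plotting call, given a list of per-bin value arrays `bindf` (bin midpoints assumed from the default bins):

import matplotlib.pyplot as plt

sigs, errs_l, errs_u = QEr_Qboot(bindf, silent=True)
bin_mids = [7.5, 15, 25, 35, 45, 60, 110]  # assumed midpoints of the default bins
plt.errorbar(bin_mids, sigs, yerr=[errs_l, errs_u], fmt='o')
plt.show()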
Example #20
def learn_similar_car_from_videos(num_instances=10, fps=24, learn_new=False,
                                  percentile=5, max_samples_per_clip=30, max_frame=600):
    '''
    Learn a similarity distribution from continuous frames that both contain cars
    :param num_instances: number of positive instances to observe
    :param fps: fps used in video indexing
    :param percentile: confidence level (in percent) for the bootstrap interval
    :param max_samples_per_clip: cap on samples taken from each clip
    :param max_frame: last frame index to consider
    :return: dictionary containing the bootstrap mean and percentile bounds
    '''
    this_dist_path = os.path.join(MODEL_DIR, TEST_SIGNIFICANCE_PERCENTILE)
    if os.path.exists(this_dist_path) and (not learn_new):
        with open(this_dist_path, 'r') as f:
            car_sim_dist = json.load(f)
        return car_sim_dist
    else:
        n = num_instances
        ret = watch_n_random_videos(n, fps=fps, max_samples_per_clip=max_samples_per_clip, max_frame=max_frame)
        ret = np.array(list(ret))
        print(ret)
        bt_ret = bs.bootstrap(ret, stat_func=bs_stats.mean, alpha=percentile / 100)
        ci = (bt_ret.lower_bound, bt_ret.upper_bound)
        mean = bt_ret.value
        l_percentile, r_percentile = ci
        car_sim_dist = {
            'l_percentile': float(l_percentile),
            'r_percentile': float(r_percentile),
            'mean': float(mean),
        }
        print(car_sim_dist)
        with open(this_dist_path, 'w+') as f:
            json.dump(car_sim_dist, f)
        return car_sim_dist
Example #21
def plot_kl(datas, interval='t'):
    fig, ax = plt.subplots(1, 1, figsize=set_size(width))
    for data in datas:
        x = np.linspace(0, 1000, data.shape[1])
        n = data.shape[0]
        if interval == 't':
            means = data.mean(axis=0)
            se = stats.sem(data, axis=0)
            low, high = stats.t.interval(0.95, n - 1, loc=means, scale=se)
        elif interval == 'bs':
            means = np.zeros(data.shape[1])
            low = np.zeros(data.shape[1])
            high = np.zeros(data.shape[1])
            for i in range(data.shape[1]):
                temp = bs.bootstrap(data[:, i],
                                    stat_func=bs_stats.mean,
                                    alpha=0.05,
                                    is_pivotal=False)
                means[i] = temp.value
                low[i] = temp.lower_bound
                high[i] = temp.upper_bound

        ax.plot(x, means)
        # ax.fill_between(x, low, high, alpha=0.2)
        ax.set_ylim([0, 0.05])
Example #22
def build_radius_predictor(epoch=50):
    train_new = False
    try:
        if train_new:
            m = multi_filter_cnn(output_dim=1)
        else:
            m = load_model('c_radius.h5')
    except Exception as e:
        print(e)
        m = multi_filter_cnn(output_dim=1)
    m.compile(optimizer='adam', loss='MSE', metrics=['MAE'])
    return_original = True
    from task_env import get_samples
    buffer_size = 50
    bs_buffer = []
    while epoch:
        np.random.seed(None)
        X = []
        X_prime = []
        Y = []
        for obj in get_samples(5000,
                               norm=False,
                               return_original=return_original,
                               noise_lvl=2):
            x, y = obj
            if return_original:
                x, x_prime = x
                x_prime = np.expand_dims(x_prime, -1)
                X_prime.append(x_prime)
            x = np.expand_dims(x, -1)
            X.append(x)
            Y.append(y[-1:])

        X = np.array(X)
        Y = np.array(Y)
        X_prime = np.array(X_prime)
        # print(X.shape, Y.shape, X_prime.shape)

        # print(np.average(X_prime), np.average(Y))
        history = m.fit([
            X_prime,
            X_prime,
        ],
                        Y,
                        epochs=1,
                        validation_split=0.1,
                        batch_size=32,
                        shuffle=True,
                        verbose=2)
        m.save('c_radius.h5')

        for i in range(len(history.history['val_mean_absolute_error'])):
            bs_buffer.insert(0, history.history['val_mean_absolute_error'][i])
            while len(bs_buffer) > buffer_size:
                bs_buffer.pop(-1)

        bs_ret = bs.bootstrap(np.array(bs_buffer), stat_func=bs_stats.mean)
        print(bs_ret)
        epoch -= 1
Example #23
    def test_bootstrap(self):
        mean = 100
        stdev = 10

        samples = np.random.normal(loc=mean, scale=stdev, size=5000)

        bsr = bs.bootstrap(samples, bs_stats.mean)

        self.assertAlmostEqual(bsr.value, 100, delta=2)
        self.assertAlmostEqual(bsr.upper_bound, 102, delta=2)
        self.assertAlmostEqual(bsr.lower_bound, 98, delta=2)

        bsr2 = bs.bootstrap(samples, bs_stats.mean, alpha=0.1)

        self.assertAlmostEqual(bsr.value, bsr2.value, delta=2)
        self.assertTrue(bsr.upper_bound > bsr2.upper_bound)
        self.assertTrue(bsr.lower_bound < bsr2.lower_bound)
Example #24
def compute_stats(est, imgpairs):

    imgpairs["votes1"], imgpairs["votes2"] = 0.5, 0.5
    imgpairs["group"] = ""

    # CIs based on the per-images vote distribution.
    for ip, p in imgpairs.iterrows():
        cest = est.loc[est.pairname == p.img1 + p.img2]
        if (cest.shape[0] > 0):
            imgpairs.loc[ip, "group"] = cest.group.values[0]
            imgpairs.loc[ip, "votes1"] = cest.loc[
                cest.vote == p.img1].shape[0] / cest.shape[0]
            imgpairs.loc[ip, "votes2"] = cest.loc[
                cest.vote == p.img2].shape[0] / cest.shape[0]

    #estcnt = est.groupby("pairname").apply(lambda x: )

    gdv = {}
    for g in np.unique(imgpairs.group.values):
        cres = imgpairs.loc[imgpairs.group == g]
        gdv[g] = bs.bootstrap(cres.votes1.values,
                              stat_func=bs_stats.mean,
                              alpha=0.05,
                              num_iterations=10000)

    g = np.sort(np.unique(imgpairs.group.values))

    if (g[0] == ""):
        g = g[1:]

    print("Count of images per group:",
          imgpairs.groupby("group").apply(lambda x: x.shape[0]))

    # CIs based on separate resampling of votes for each image.
    means = collections.defaultdict(list)
    allgroups = np.unique(imgpairs.group.values)
    for count in range(10000):
        if (count % 100 == 0):
            print(count)
        cmeans = collections.defaultdict(list)
        for ip, p in imgpairs.iterrows():
            cest = est.loc[est.pairname == p.img1 + p.img2]
            if (cest.shape[0] == 0):
                continue
            cmeans[cest.group.values[0]].append(
                np.mean(np.random.choice(cest.vote == p.img1, 40,
                                         replace=True)))
        for group in allgroups:
            means[group].append(np.mean(cmeans[group]))

    for group in allgroups:
        pCI = np.percentile(means[group], [0, 95])
        print("%s: %.3lf (%.3lf, %.3lf)" %
              (group, np.mean(imgpairs.loc[imgpairs.group == group,
                                           "votes1"]), pCI[0], pCI[1]))

    return imgpairs, g, gdv
Example #25
def statistical(sample, ss, sz, _alpha):

    resample = [sample[i] for i in np.random.choice(ss, min(sz, 10000))]
    bmi_sample = [te(height, weight) for (height, weight) in resample]
    res = bs.bootstrap(np.array(bmi_sample),
                       stat_func=bs_stats.mean,
                       alpha=_alpha)

    return (res.lower_bound, res.value, res.upper_bound)
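te() is defined elsewhere in that project; given the (height, weight) tuples, a hypothetical BMI-style implementation would be:

def te(height, weight):
    # hypothetical stand-in: body-mass index, height in metres, weight in kilograms
    return weight / (height ** 2)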
Example #26
def real_data_test3w():
    df = pd.read_csv("1005-ctr.sql", sep='\t')

    total_ctr = float(np.sum(df["cli_pv"])) / np.sum(df["exp_pv"])

    p_out, p_in, flag = 0, 0, 0

    z_out, z_in, z_flag = 0, 0, 0

    sample_size = 30000
    bucket_num = 50
    split_num = sample_size / bucket_num
    num_iterations = 10000
    for i in range(0, 1000):
        print("{0}th test--------------------".format(i))
        buck_index = np.floor(np.arange(0, sample_size) / split_num)
        filename1 = "data/0928A30w_{0}".format(i)

        if os.path.exists(filename1):
            sample1 = pd.read_csv(filename1, sep='\t')
        else:
            sample1 = df.sample(n=sample_size)
            sample1["bucket_index"] = buck_index
            sample1.to_csv(filename1, sep='\t')

        sample_0928 = sample1.groupby('bucket_index')[[
            "cli_pv", "exp_pv"
        ]].sum().add_suffix('_sum').reset_index()

        # bootstrap

        r = bs.bootstrap(sample_0928.cli_pv_sum.values,
                         bs_stats.mean,
                         denominator_values=sample_0928.exp_pv_sum.values)

        point, low, high = r.value, r.lower_bound, r.upper_bound
        if total_ctr >= low and total_ctr <= high:
            p_in = p_in + 1
            flag = 1
        else:
            p_out = p_out + 1
            flag = 0
        print("flag:{0}, diff:{1}, real:{2}, low:{3}, high:{4}, width:{5}".
              format(flag, point - total_ctr, total_ctr, low, high,
                     high - low))

        if i % 50 == 0 or i == 999:
            print("30w,50bucket,not cover:{0},cover:{1}".format(p_out, p_in))
            count = 0
            for pv in sample_0928.exp_pv_sum.values:
                print(pv)
                count += 1
                if count == 20:
                    break

    print("end")
Example #27
def build_center_predictor(epoch=50):
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update(
        {"euclidean_distance_loss": euclidean_distance_loss})
    train_new = False
    try:
        if train_new:
            m = multi_filter_cnn()
        else:
            m = load_model('c_center.h5')
    except Exception as e:
        print(e)
        m = multi_filter_cnn()
    m.compile(optimizer='adam', loss=euclidean_distance_loss, metrics=['MAE'])
    return_original = True
    from task_env import get_samples
    buffer_size = 50
    bs_buffer = []
    while epoch:
        np.random.seed(None)
        X = []
        X_prime = []
        Y = []
        for obj in get_samples(5000,
                               norm=False,
                               return_original=return_original,
                               noise_lvl=2):
            x, y = obj
            if return_original:
                x, x_prime = x
                x_prime = np.expand_dims(x_prime, -1)
                X_prime.append(x_prime)
            x = np.expand_dims(x, -1)
            X.append(x)
            Y.append(y[:2])
        Y = np.array(Y)
        X_prime = np.array(X_prime)
        # print(np.average(X_prime), np.average(Y))
        history = m.fit([
            X_prime,
            X_prime,
        ],
                        Y,
                        epochs=1,
                        validation_split=0.1,
                        batch_size=32,
                        shuffle=True,
                        verbose=1)
        m.save('c_center.h5')
        for i in range(len(history.history['val_loss'])):
            bs_buffer.insert(0, history.history['val_loss'][i])
            while len(bs_buffer) > buffer_size:
                bs_buffer.pop(-1)
        bs_ret = bs.bootstrap(np.array(bs_buffer), stat_func=bs_stats.mean)
        print(bs_ret)
        epoch -= 1
Example #28
def getBootstrapGHS(beta1, beta2, density, bootstrapSampleSize, weightType):
    ghsResultList = []
    for i in range(bootstrapSampleSize):
        bca, bcl, ghs = ghs2.ghs2(beta1.getDistribution(density),
                                  beta2.getDistribution(density),
                                  weightType,
                                  onlyGHS=False)
        ghsResultList.append(ghs)
    result = bs.bootstrap(numpy.array(ghsResultList), stat_func=bs_stats.mean)
    return result
Example #29
def estimator_bootstrap(err, custom_stat=None, alpha=0.05, n_iter=10000):
    """
      def custom_stat(values, axis=1):
      # stat_val = np.mean(np.asmatrix(values),axis=axis)
      # stat_val = np.std(np.asmatrix(values),axis=axis)p.mean
      stat_val = np.sqrt(np.mean(np.asmatrix(values*values),axis=axis))
      return stat_val
    """
    import bootstrapped.bootstrap as bs
    res = bs.bootstrap(err, stat_func=custom_stat, alpha=alpha, num_iterations=n_iter)
    return res
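A usage sketch built from the docstring's own custom_stat example (an RMSE-style statistic, using np.asarray rather than the docstring's np.asmatrix for a plain 1-D result); the error array is synthetic.

import numpy as np

def custom_stat(values, axis=1):
    # RMSE over each resampled row, as in the docstring above
    return np.sqrt(np.mean(np.asarray(values) ** 2, axis=axis))

err = np.random.normal(0, 1, size=1000)
res = estimator_bootstrap(err, custom_stat=custom_stat)
print(res.value, res.lower_bound, res.upper_bound)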
Example #30
def bootstrap_KG(subproblem, attributes, sample_solutions):
    """Evaluates bootstrap for KG samples
   Parameters
   ----------
   ...
   
   Returns
   ----------
   KGmean : float
      Mean, as obtained from boostrap methd
   KGstd : float
      Std, as obtained from boostrap methd
   """
    # Get the KG samples
    #KG_sample_sol = np.array(subproblem['KG_sample_sol'])
    KG_sample_sol = np.array(sample_solutions)

    b = attributes['resamples']
    alpha = attributes['confidence']

    # Use bootstrap to approximate mean
    boost_mean_dist = bs.bootstrap(values=KG_sample_sol,
                                   stat_func=bs_stats.mean,
                                   alpha=alpha,
                                   num_iterations=b,
                                   iteration_batch_size=None,
                                   is_pivotal=True,
                                   num_threads=1,
                                   return_distribution=True)

    # Use bootstrap to approximate std
    boost_std_dist = bs.bootstrap(values=KG_sample_sol,
                                  stat_func=bs_stats.std,
                                  alpha=alpha,
                                  num_iterations=b,
                                  iteration_batch_size=None,
                                  is_pivotal=True,
                                  num_threads=1,
                                  return_distribution=True)

    return boost_mean_dist.mean(), boost_std_dist.mean()
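A hypothetical call, assuming only the two attribute keys the function actually reads; subproblem is unused here (its only use is commented out), and the sample array is synthetic.

import numpy as np

attributes = {'resamples': 5000, 'confidence': 0.05}
sample_solutions = np.random.normal(10.0, 2.0, size=200)  # synthetic KG samples
kg_mean, kg_std = bootstrap_KG(subproblem=None,
                               attributes=attributes,
                               sample_solutions=sample_solutions)
print(kg_mean, kg_std)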