def test_xgb(self):
    """Score the fitted XGBoost model on repeated resamples of the test data.

    Each of the ``self.boost_iteration`` rounds resamples the cohort and
    control test sets to 4/5 of their length, loads them through the
    ``aquire_batch_data_*`` helpers, and records ROC AUC and average
    precision computed from column 1 of ``predict_proba``. A bootstrap
    estimate of the mean of each metric is printed at the end.
    """
    # Built-in int() replaces np.int, an alias NumPy deprecated in 1.20 and
    # removed in 1.24.
    sample_size_cohort = int(np.floor(len(self.test_data_cohort) * 4 / 5))
    sample_size_control = int(np.floor(len(self.test_data_control) * 4 / 5))
    auc = []
    auprc = []
    for _ in range(self.boost_iteration):
        test_cohort = resample(self.test_data_cohort,
                               n_samples=sample_size_cohort)
        test_control = resample(self.test_data_control,
                                n_samples=sample_size_control)
        self.aquire_batch_data_cohort(0, test_cohort, len(test_cohort))
        self.aquire_batch_data_control(0, test_control, len(test_control))
        self.aquire_batch_data_whole()
        # Predict once and reuse the scores for both metrics.
        scores = self.xg_model.predict_proba(self.one_batch_data_whole)[:, 1]
        auc.append(roc_auc_score(self.one_batch_logit_whole, scores))
        auprc.append(
            average_precision_score(self.one_batch_logit_whole, scores))
    print("auc")
    print(bs.bootstrap(np.array(auc), stat_func=bs_stats.mean))
    print("auprc")
    print(bs.bootstrap(np.array(auprc), stat_func=bs_stats.mean))
def test_pivotal(self):
    """Check that bootstrap results agree across option variants.

    First compares ``bootstrap_ab`` with default settings against
    ``is_pivotal=False``, then ``bootstrap`` with default settings against
    ``num_threads=10``. Value and both bounds must match within 0.1.
    """
    centre, spread = 100, 10
    test = np.random.normal(loc=centre, scale=spread, size=500)
    ctrl = np.random.normal(loc=centre, scale=spread, size=5000)
    test = test * 1.1
    fields = ('value', 'lower_bound', 'upper_bound')

    reference = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                                bs_compare.percent_change)
    variant = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                              bs_compare.percent_change, is_pivotal=False)
    for field in fields:
        self.assertAlmostEqual(getattr(reference, field),
                               getattr(variant, field), delta=.1)

    reference = bs.bootstrap(test, bs_stats.mean)
    variant = bs.bootstrap(test, bs_stats.mean, num_threads=10)
    for field in fields:
        self.assertAlmostEqual(getattr(reference, field),
                               getattr(variant, field), delta=.1)
def eval_stats(exp, pred, N):
    """Compute RMSE and Kendall tau with percentile bounds from bootstrapping.

    Args:
        exp: Expected values.
        pred: Predicted values.
        N: Number of rows the data is reshaped to before bootstrapping.

    Returns:
        A list ``[rmse, rmse_2.5, rmse_97.5, tau, tau_2.5, tau_97.5]`` where
        the bounds are the 2.5th and 97.5th percentiles of the returned
        bootstrap distributions.
    """
    abs_error = np.abs(exp - pred)
    boot_options = dict(num_iterations=1000, alpha=0.05, is_pivotal=True,
                        return_distribution=True)

    rmse_samples = bs.bootstrap(np.reshape(abs_error, (N, -1)),
                                stat_func=RMSE_function, **boot_options)
    rmse = np.sqrt(np.mean(abs_error**2))

    tau = scipy.stats.kendalltau(exp, pred)[0]
    paired = np.reshape(np.array(list(zip(exp, pred))), (N, -1))
    tau_samples = bs.bootstrap(paired, stat_func=tau_function,
                               **boot_options)

    summary = [rmse]
    summary.append(np.percentile(rmse_samples, 2.5))
    summary.append(np.percentile(rmse_samples, 97.5))
    summary.append(tau)
    summary.append(np.percentile(tau_samples, 2.5))
    summary.append(np.percentile(tau_samples, 97.5))
    return summary
def test(self):
    """Score the session model on repeated resamples of the test data.

    Each of the ``self.boost_iteration`` rounds resamples the cohort and
    control test sets to 4/5 of their length, concatenates them (cohort
    first, labelled 1; control labelled 0), runs ``self.logit_sig`` through
    ``self.sess`` and records ROC AUC and average precision. A bootstrap
    estimate of the mean of each metric is printed at the end.

    Side effects: prints the round index each iteration and leaves the last
    model output in ``self.out_logit``.
    """
    # Built-in int() replaces np.int, an alias NumPy deprecated in 1.20 and
    # removed in 1.24.
    sample_size_cohort = int(np.floor(len(self.test_data_cohort) * 4 / 5))
    sample_size_control = int(np.floor(len(self.test_data_control) * 4 / 5))
    auc = []
    auprc = []
    for i in range(self.boost_iteration):
        print(i)
        test_cohort = resample(self.test_data_cohort,
                               n_samples=sample_size_cohort)
        test_control = resample(self.test_data_control,
                                n_samples=sample_size_control)
        test_data = test_cohort + test_control
        # Cohort rows come first, so only that leading slice is labelled 1.
        logit_test = np.zeros(len(test_cohort) + len(test_control))
        logit_test[0:len(test_cohort)] = 1
        self.aquire_batch_data(0, test_data, len(test_data), logit_test)
        self.out_logit = self.sess.run(
            self.logit_sig, feed_dict={self.input_x: self.one_batch_data})
        auc.append(roc_auc_score(self.one_batch_logit, self.out_logit))
        auprc.append(
            average_precision_score(self.one_batch_logit, self.out_logit))
    print("auc")
    print(bs.bootstrap(np.array(auc), stat_func=bs_stats.mean))
    print("auprc")
    print(bs.bootstrap(np.array(auprc), stat_func=bs_stats.mean))
def test_bootstrap_ratio(self):
    """Contrast bootstrapping with ``denominator_values`` against
    bootstrapping the mean of per-element ratios.

    With ``denominator_values`` the expected value is about 1; the mean of
    the element-wise ratios is expected to be about 50.
    """
    denom = np.repeat([10, 1 / 10.], 100)
    samples = np.repeat([1 / 10., 10], 100)

    with_denominator = bs.bootstrap(samples, bs_stats.mean,
                                    denominator_values=denom)
    self.assertAlmostEqual(with_denominator.value, 1, delta=.1)

    elementwise = bs.bootstrap(samples / denom, bs_stats.mean)
    self.assertAlmostEqual(elementwise.value, 50, delta=5)
def calculate_real_gain(real_inputs, real_outputs, sampling_size,
                        input_distance_type, output_distance_type, model):
    """Bootstrap the mean and standard deviation of pairwise gains.

    Random, disjoint index pairs are drawn from the data; for each pair the
    gain is ``distance(out1, out2) / (distance(in1, in2) + EPS)``. Rounds of
    sampling repeat until at least ``sampling_size`` gains have been
    collected. A final round may overshoot, and every gain is kept.

    Args:
        real_inputs: Indexable collection of inputs.
        real_outputs: Indexable collection of outputs; its length defines
            the index range that is sampled.
        sampling_size: Minimum number of gains to collect.
        input_distance_type: Distance type passed to ``distance`` for
            inputs. ``"infersent-cosine"`` first encodes them with ``model``.
        output_distance_type: Same, for outputs.
        model: Encoder exposing ``encode``; only used for
            ``"infersent-cosine"``.

    Returns:
        ``(mean_results, stdev_results)`` as returned by ``bs.bootstrap``.

    Raises:
        ValueError: If gains are requested but fewer than two samples are
            available, since no pair can ever be formed.
    """
    population = len(real_outputs)
    if sampling_size > 0 and population < 2:
        # Without this guard the sampling loop below would never finish:
        # each round would produce zero pairs.
        raise ValueError(
            "calculate_real_gain needs at least two samples to form a pair")

    gains = []
    while len(gains) < sampling_size:
        subsampling_idxs = random.sample(
            range(population), min(sampling_size * 2, population))
        # Drop one index when the draw is odd so it splits into equal halves.
        if len(subsampling_idxs) % 2 != 0:
            subsampling_idxs = subsampling_idxs[1:]
        half = len(subsampling_idxs) // 2
        batch1 = subsampling_idxs[:half]
        batch2 = subsampling_idxs[half:]

        inputs1 = [real_inputs[i] for i in batch1]
        inputs2 = [real_inputs[i] for i in batch2]
        outputs1 = [real_outputs[i] for i in batch1]
        outputs2 = [real_outputs[i] for i in batch2]

        if input_distance_type == "infersent-cosine":
            inputs1 = model.encode(inputs1, bsize=128, tokenize=False,
                                   verbose=True)
            inputs2 = model.encode(inputs2, bsize=128, tokenize=False,
                                   verbose=True)
        if output_distance_type == "infersent-cosine":
            outputs1 = model.encode(outputs1, bsize=128, tokenize=False,
                                    verbose=True)
            outputs2 = model.encode(outputs2, bsize=128, tokenize=False,
                                    verbose=True)

        for in1, in2, out1, out2 in zip(inputs1, inputs2, outputs1, outputs2):
            input_distance = distance(in1, in2,
                                      distance_type=input_distance_type)
            output_distance = distance(out1, out2,
                                       distance_type=output_distance_type)
            # EPS keeps the ratio finite when the two inputs coincide.
            gains.append(output_distance / (input_distance + EPS))

    gains = np.array(gains)
    mean_results = bs.bootstrap(gains, stat_func=bs_stats.mean)
    stdev_results = bs.bootstrap(gains, stat_func=bs_stats.std)
    return mean_results, stdev_results
def dist_plot(imgt, imgc, trt_name, ctr_name, meas, qname, unit, xdelta,
              binstep):
    """Print paired treated-vs-control statistics for one measure and plot
    the distribution of within-pair differences.

    Reads the ``<meas>_true``, ``<meas>_est`` and ``<meas>_err`` columns of
    ``imgt`` and ``imgc``, writes a ``<meas>_rel_diff`` column into ``imgt``,
    prints Wilcoxon test results and bootstrap estimates of the mean
    difference, then saves the plot as a PDF under ``Plots/`` and shows it.

    Args:
        imgt: Treated-group table (modified in place).
        imgc: Control-group table.
        trt_name: Treatment label for messages and the file name.
        ctr_name: Control label for messages and the file name.
        meas: Column-name prefix of the measure.
        qname: Quantity name used in the x-axis label.
        unit: Unit string; wrapped in brackets when non-empty.
        xdelta: Half-width of the x-axis range and bin range.
        binstep: Bin step of the histogram.
    """
    if unit != "":
        unit = " [" + unit + "]"

    true_col = meas + "_true"
    est_col = meas + "_est"
    err_col = meas + "_err"
    rel_col = meas + "_rel_diff"

    # The result of this linspace is not used further down.
    lowest_true = min(np.amin(imgt[true_col]), np.amin(imgc[true_col]))
    highest_true = max(np.amax(imgt[true_col]), np.amax(imgc[true_col]))
    h = np.linspace(lowest_true, highest_true, 11)

    est_treated = imgt[est_col].values
    est_control = imgc[est_col].values

    print("Treated vs control for ", meas, ":")
    print(">:", np.sum(est_treated > est_control))
    print("<:", np.sum(est_treated < est_control))

    sw, pval = spst.wilcoxon(imgt[err_col], imgc[err_col])
    print("Pairwise difference:", trt_name, "-", ctr_name, "for", meas, unit)
    print('Wilcoxon (t, pval): %.3lf, %.5lf' % (sw, pval))
    print("Treated:", np.mean(imgt[est_col]), unit, "vs untreated:",
          np.mean(imgc[est_col]), unit)
    abs_effect = bs.bootstrap(est_treated - est_control,
                              stat_func=bs_stats.mean, alpha=0.05,
                              num_iterations=10000)
    print("Effect strength:", abs_effect)

    # Difference relative to the mean of the two true values.
    true_sum = imgt[true_col].values + imgc[true_col].values
    imgt[rel_col] = 2 * (est_treated - est_control) / true_sum
    sw, pval = spst.wilcoxon(imgt[rel_col])
    print("\nRelative pairwise difference:", trt_name, "-", ctr_name, "for",
          meas, '%')
    print('Wilcoxon (t, pval): %.3lf, %.5lf' % (sw, pval))
    rel_effect = bs.bootstrap(imgt[rel_col].values, stat_func=bs_stats.mean,
                              alpha=0.05, num_iterations=10000)
    print("Effect strength:", rel_effect)

    constant_bins = range(-xdelta, xdelta, binstep)
    sns.distplot(imgt[est_col].values - imgc[est_col].values,
                 bins=constant_bins, color=get_meas_color(meas)[0])
    plt.axvline(x=0, color="black", linewidth='1.0', linestyle="dashed")
    plt.xlim(-xdelta, xdelta)
    x_label = "Estimated within-pair %s diff.%s: %s minus %s" % (
        qname, unit, trt_name, ctr_name)
    plt.xlabel(x_label, fontsize=fslegend)
    plt.ylabel('Relative frequency', fontsize=fslegend)
    plt.xticks(fontsize=fsticks)
    plt.yticks(fontsize=fsticks)
    plt.legend(fontsize=fslegend)
    plt.tight_layout()
    out_file = ('Plots/' + 'plot_dist_' + meas + '_pairwise_est_clean_' +
                trt_name + '_' + ctr_name + '.pdf')
    plt.savefig(out_file)
    plt.show()
def confidence_intervals(self, do_new_class_ci=False, alpha=0.05):
    """Bootstrap the mean of the recorded trial results.

    Args:
        do_new_class_ci: When true, also bootstrap
            ``self.new_class_results``.
        alpha: Passed through to ``bs.bootstrap``.

    Returns:
        ``(m, nc)``, each a ``(value, lower_bound, upper_bound)`` tuple.
        ``nc`` is ``(0, 0, 0)`` when ``do_new_class_ci`` is false.
    """
    def _mean_interval(samples):
        estimate = bs.bootstrap(np.array(samples), stat_func=bs_stats.mean,
                                alpha=alpha)
        return (estimate.value, estimate.lower_bound, estimate.upper_bound)

    m = _mean_interval(self.trial_results)
    if not do_new_class_ci:
        return m, (0, 0, 0)
    return m, _mean_interval(self.new_class_results)
def apply_bootstrap(self, data):
    """Bootstrap the mean of per-user ``action`` sums for the control variant.

    TODO: implement the data cleaning step.

    The bootstrap result is currently discarded and an empty list is
    returned.
    """
    # NOTE(review): strip() is the only "cleaning" applied here. If `data`
    # is a pandas DataFrame this call would not exist; confirm the type.
    cleaned = data.strip()
    control_rows = cleaned[(cleaned['VARIANT_NAME'] == 'control')]
    actions_per_user = control_rows.groupby('USER_ID').action.sum().values
    bs.bootstrap(actions_per_user,
                 stat_func=bs_stats.mean,
                 num_iterations=10000,
                 iteration_batch_size=300,
                 return_distribution=True)
    return []
def test_bootstrap_batch_size(self):
    """Check that ``iteration_batch_size=10`` does not change the results.

    Runs ``bootstrap_ab`` and ``bootstrap`` with and without the batch size
    and requires value and both bounds to match within 0.1.
    """
    centre, spread = 100, 10
    test = np.random.normal(loc=centre, scale=spread, size=500)
    ctrl = np.random.normal(loc=centre, scale=spread, size=5000)
    test = test * 1.1
    fields = ('value', 'lower_bound', 'upper_bound')

    plain = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                            bs_compare.percent_change)
    batched = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                              bs_compare.percent_change,
                              iteration_batch_size=10)
    for field in fields:
        self.assertAlmostEqual(getattr(plain, field),
                               getattr(batched, field), delta=.1)

    plain = bs.bootstrap(test, bs_stats.mean)
    batched = bs.bootstrap(test, bs_stats.mean, iteration_batch_size=10)
    for field in fields:
        self.assertAlmostEqual(getattr(plain, field),
                               getattr(batched, field), delta=.1)
def getBootstrapHellKl(beta1, beta2, density, bootstrapSampleSize):
    """Bootstrap the mean Hellinger and KL distances between two objects.

    For each of ``bootstrapSampleSize`` rounds, fresh ``getDistribution``
    results are requested from both objects, separately for the Hellinger
    and for the KL computation.

    Returns:
        ``(rBKL, rBHell)``: bootstrap results for the mean KL and mean
        Hellinger distance, in that order.
    """
    hellingerValues = []
    klValues = []
    for _ in range(bootstrapSampleSize):
        # Distributions are fetched again for each metric, as in the
        # original call order (beta1 then beta2, Hellinger before KL).
        first = beta1.getDistribution(density)
        second = beta2.getDistribution(density)
        hellingerValues.append(distanceMetrics.hellinger1(first, second))

        first = beta1.getDistribution(density)
        second = beta2.getDistribution(density)
        klValues.append(distanceMetrics.dkl(first, second))

    rBKL = bs.bootstrap(numpy.array(klValues), stat_func=bs_stats.mean)
    rBHell = bs.bootstrap(numpy.array(hellingerValues),
                          stat_func=bs_stats.mean)
    return rBKL, rBHell
def generate_rastrigin_statistics(pop_size, runs=30, n=10):
    """Run the GA on an ``n``-dimensional Rastrigin instance ``runs`` times.

    Each run is executed, saved via ``save_results``, and summarised. After
    all runs, bootstrap estimates of the mean final best objective and the
    mean final descaled population fitness are printed.

    Returns:
        ``(test_best_fitness, test_mean_fitness, bs_best_fitness,
        bs_mean_fitness, final_best_sol)``; the first two hold one
        per-generation list per run.
    """
    problem = Rastrigin(n)
    lower, upper = [-5.12] * n, [5.12] * n

    final_best_objective, final_best_sol = [], []
    test_best_fitness, test_mean_fitness = [], []
    final_mean_fitness = []

    for run_index in range(runs):
        ga = GA(lower, upper, problem.f, pop_size=pop_size, num_bits=20)
        ga.run()
        ga.save_results(run_index)

        per_generation_mean = list(map(np.mean, ga.generation_fitness))
        per_generation_best = list(map(np.max, ga.generation_fitness))
        test_best_fitness.append(per_generation_best)
        test_mean_fitness.append(per_generation_mean)

        final_best_objective.append(ga.best_objective)
        final_best_sol.append(ga.best_solution)
        final_mean_fitness.append(
            ga.descale(np.mean(ga.population_fitness)))

        print('BEST SOL: {}'.format(ga.best_solution))
        print('BEST FOBJ: {}'.format(ga.best_objective))
        print('=================================================')

    bs_best_fitness = bs.bootstrap(np.array(final_best_objective),
                                   stat_func=bs_stats.mean)
    bs_mean_fitness = bs.bootstrap(np.array(final_mean_fitness),
                                   stat_func=bs_stats.mean)

    print('Melhor solução final: {} CI 95% ({}, {})'.format(
        bs_best_fitness.value, bs_best_fitness.lower_bound,
        bs_best_fitness.upper_bound))
    print('Melhor solução MÉDIA final: {} CI 95% ({}, {})'.format(
        bs_mean_fitness.value, bs_mean_fitness.lower_bound,
        bs_mean_fitness.upper_bound))

    return (test_best_fitness, test_mean_fitness, bs_best_fitness,
            bs_mean_fitness, final_best_sol)
def main():
    """Estimate detection precision at IoU > 0.7 and print its bootstrap.

    Runs 30 trials of 100 noisy circles each. For every circle the centre
    and radius predictors are queried, the IoU against the true parameters
    is computed, and the fraction of IoUs above 0.7 becomes one sample.
    """
    size = INPUT_SHAPE[0]
    num_trials = 30

    def _trial_precision():
        ious = []
        for _ in range(100):
            params, img = noisy_circle(size, RADIUS, 2)
            params = list(params)
            # Both predictors receive the same single-image batch twice.
            batch = np.array([np.expand_dims(img, -1)])
            center = center_predictor.predict([batch, batch])[0].tolist()
            radius = radius_predictor.predict([batch, batch])[0].tolist()
            detected = [center[0], center[1]] + radius
            ious.append(iou(params, detected))
        return (np.array(ious) > 0.7).mean()

    samples = np.array([_trial_precision() for _ in range(num_trials)])
    bs_ret = bs.bootstrap(samples, stat_func=bs_stats.mean, alpha=0.05)
    print(bs_ret)
def main():
    """Print the bootstrapped mean AUC of coverage over time.

    For every non-empty plot-data file, times are rebased to start at zero
    and the AUC of ``map_size`` over ``unix_time`` is taken up to the
    requested fraction (``args.percentile``) of the final coverage. Files
    that are empty or parse to an empty frame are skipped.
    """
    args = parse_args()
    plot_data_paths = args.plot_data
    num_plot_datas = len(plot_data_paths)

    def _partial_auc(path):
        """Return the AUC for one file, or None when it has no data."""
        if path.stat().st_size == 0:
            return None
        with path.open() as inf:
            df = read_plot_data(inf)
        if df.empty:
            return None

        df['unix_time'] = df.unix_time - df.unix_time.iloc[0]
        cutoff = df.map_size.iloc[-1] * args.percentile
        selected = df[df.map_size <= cutoff]
        # Fall back to the first two rows when fewer than two rows qualify.
        if len(selected) < 2:
            selected = df[0:2]
        return metrics.auc(selected.unix_time, selected.map_size)

    aucs = []
    for plot_data_path in plot_data_paths:
        auc = _partial_auc(plot_data_path)
        if auc is not None:
            aucs.append(auc)

    # Compute the mean AUC and confidence intervals
    auc_ci = bs.bootstrap(np.array(aucs), stat_func=bs_stats.mean)
    print(f'mean AUC ({num_plot_datas} plot_data files)')
    print(f' {auc_ci.value:.02f} +/- {auc_ci.error_width() / 2:.02f}')
def avg_velocity_from_k(episodes, k = 0.0):
    """Bootstrap per-episode average x-velocity for steps with ``x >= k``.

    Reads the module-level table ``v``. Episodes with fewer than 200
    qualifying steps are skipped. Velocity is the x displacement between the
    first and last qualifying step divided by their index distance times
    ``STEP_TIME``.

    Returns:
        ``(mean, std, mean_upper_bound, mean_lower_bound, max, min)``.
    """
    velocities = []
    for episode in episodes:
        track = v[v['episode'] == episode][['x', 'y']]
        track = track[track['x'] >= k]
        if len(track) < 200:
            continue
        start, end = track.iloc[0], track.iloc[-1]
        elapsed = (end.name - start.name) * STEP_TIME
        velocities.append((end['x'] - start['x'])/elapsed)

    mean_est = bs.bootstrap(np.array(velocities), stat_func=bs_stats.mean,
                            alpha=0.05)
    std_est = bs.bootstrap(np.array(velocities), stat_func=bs_stats.std,
                           alpha=0.05)
    return (mean_est.value, std_est.value, mean_est.upper_bound,
            mean_est.lower_bound, np.max(velocities), np.min(velocities))
def calculate_metrics(results):
    """Build a table of bootstrapped mean ``fun`` values.

    ``results`` is indexed as ``results[s][t][p]`` and holds result objects
    with a ``fun`` attribute. Each cell of the returned frame is a
    ``"lower<mean<upper"`` string rounded to three decimals. For a given
    ``(p, s)`` the entry from the last ``t`` visited wins.

    Returns:
        A DataFrame with one row per ``p`` and one column per ``s``.
    """
    import bootstrapped.bootstrap as bs
    import bootstrapped.stats_functions as bs_stats

    stat_dict = {}
    for s in tqdm(results):
        for t in results[s]:
            for p in results[s][t]:
                row = stat_dict.setdefault(p, {})
                opts = np.array([run.fun for run in results[s][t][p]])
                estimate = bs.bootstrap(opts,
                                        stat_func=bs_stats.mean,
                                        num_iterations=1000000,
                                        iteration_batch_size=100000,
                                        num_threads=-1)
                bounds = (estimate.lower_bound, estimate.value,
                          estimate.upper_bound)
                row[s] = "%s<%s<%s" % tuple(round(b, 3) for b in bounds)

    from pandas import DataFrame

    # Build with s as rows, then transpose so p indexes the rows.
    return DataFrame(stat_dict, index=list(results.keys())).T
def bootstrap(dataset: Dataset, combined_data: CombinedData):
    """Bootstrap the median of the explained variable per category.

    For every category of every explanatory variable, the matching rows of
    the explained variable are selected from ``dataset`` and their median is
    bootstrapped.

    Returns:
        A dict mapping each category to its bootstrap result. Categories
        with the same name under different variables overwrite each other.
    """
    explanatory = combined_data.get_explanatory_variables()
    explained = combined_data.get_explained_variables()

    calculations = {}
    for y in explained:
        # for now
        assert (len(explained) == 1)

        # Main effects
        for x in explanatory:
            for c in list(x.metadata[categories].keys()):
                subset = dataset.select(
                    y.metadata[name],
                    where=[f"{x.metadata[name]} == '{c}'"])
                calculations[c] = bs.bootstrap(subset.to_numpy(),
                                               stat_func=bs_stats.median)
    return calculations
def generate_rastrigin_statistics(pop_size, mutation_probability,
                                  crossover_probability, runs=30, n=10):
    """Run the GA on an ``n``-dimensional Rastrigin instance ``runs`` times.

    Args:
        pop_size: Population size passed to the GA.
        mutation_probability: Mutation probability passed to the GA.
        crossover_probability: Crossover probability passed to the GA.
        runs: Number of independent GA runs.
        n: Problem dimension.

    Returns:
        ``(test_best_fitness, test_mean_fitness, bs_best_fitness,
        final_best_sol, statistics)``. The first two hold one
        per-generation list per run, ``bs_best_fitness`` is the bootstrap of
        the mean final best objective, ``final_best_sol`` holds each run's
        best solution, and ``statistics`` is a one-column DataFrame of the
        final best objectives.
    """
    m = Rastrigin(n)
    final_best_fitness = []
    final_best_sol = []
    test_best_fitness = []
    test_mean_fitness = []
    for _ in range(runs):
        ga_instance = GA([-5.12] * n, [5.12] * n, m.f,
                         num_generations=10000,
                         mutation_probability=mutation_probability,
                         pop_size=pop_size,
                         crossover_probability=crossover_probability)
        ga_instance.run()
        test_best_fitness.append(
            [np.max(v) for v in ga_instance.generation_fitness])
        test_mean_fitness.append(
            [np.mean(v) for v in ga_instance.generation_fitness])
        final_best_fitness.append(ga_instance.best_objective)
        # Store the solution itself; the original appended best_objective
        # here as well, duplicating final_best_fitness.
        final_best_sol.append(ga_instance.best_solution)

    # Generate statistics table
    statistics = pd.DataFrame()
    statistics['Melhor solução final'] = np.array(final_best_fitness)
    bs_best_fitness = bs.bootstrap(np.array(final_best_fitness),
                                   stat_func=bs_stats.mean)
    print('Melhor solução final: {} CI 95% ({}, {})'.format(
        bs_best_fitness.value, bs_best_fitness.lower_bound,
        bs_best_fitness.upper_bound))
    return (test_best_fitness, test_mean_fitness, bs_best_fitness,
            final_best_sol, statistics)
def QEr_Qboot(bindf, bins=[5, 10, 20, 30, 40, 50, 70, 150], silent=False):
    """Compute the standard deviation per bin with bootstrap error bars.

    Args:
        bindf: Iterable of per-bin value collections, at most
            ``len(bins) - 1`` of them.
        bins: Bin edges; only their count is used, to size the outputs.
        silent: When false, print each bin's shape and bootstrap result.

    Returns:
        ``(qbootsigs, qbootsigerrsl, qbootsigerrsu)``: the per-bin standard
        deviation and the lower and upper error-bar sizes (distance from the
        standard deviation to the bootstrap bound). Bins whose bootstrap ran
        out of memory get ``nan`` error bars.
    """
    n_out = np.shape(bins)[0] - 1
    qbootsigs = np.zeros((n_out, ))
    qbootsigerrsu = np.zeros((n_out, ))
    qbootsigerrsl = np.zeros((n_out, ))
    for i, Qv in enumerate(bindf):
        if not silent:
            print(np.shape(Qv))
        Qv = np.asarray(Qv)
        qbootsigs[i] = np.std(Qv)
        try:
            bsr = bs.bootstrap(Qv, stat_func=bs_stats.std,
                               iteration_batch_size=100)
        except MemoryError:
            print('There was a memory error - too much memory to be allocated')
            # No result exists for this bin. Mark the bounds as unknown
            # instead of reading an unbound or stale `bsr`.
            qbootsigerrsu[i] = np.nan
            qbootsigerrsl[i] = np.nan
            continue
        if not silent:
            print(bsr)
        qbootsigerrsu[i] = bsr.upper_bound
        qbootsigerrsl[i] = bsr.lower_bound
    # Change over to size of error bars, not confidence interval.
    qbootsigerrsu = qbootsigerrsu - qbootsigs
    qbootsigerrsl = -qbootsigerrsl + qbootsigs
    return qbootsigs, qbootsigerrsl, qbootsigerrsu
def learn_similar_car_from_videos(num_instances=10,
                                  fps=24,
                                  learn_new=False,
                                  percentile=5,
                                  max_samples_per_clip=30,
                                  max_frame=600):
    '''
    Load or learn the similarity distribution summary used for car matching.

    If a cached JSON file exists and ``learn_new`` is false, it is loaded and
    returned. Otherwise similarities are collected with
    ``watch_n_random_videos``, their mean is bootstrapped, and the summary is
    written to the cache file.

    :param num_instances: number passed to ``watch_n_random_videos``
    :param fps: fps passed to ``watch_n_random_videos``
    :param learn_new: recompute even if a cached file exists
    :param percentile: bootstrap alpha, given in percent
    :param max_samples_per_clip: passed to ``watch_n_random_videos``
    :param max_frame: passed to ``watch_n_random_videos``
    :return: dictionary with keys ``l_percentile``, ``r_percentile``, ``mean``
    '''
    cache_path = os.path.join(MODEL_DIR, TEST_SIGNIFICANCE_PERCENTILE)
    if os.path.exists(cache_path) and (not learn_new):
        with open(cache_path, 'r') as cached:
            return json.load(cached)

    observed = watch_n_random_videos(num_instances,
                                     fps=fps,
                                     max_samples_per_clip=max_samples_per_clip,
                                     max_frame=max_frame)
    observed = np.array(list(observed))
    print(observed)

    estimate = bs.bootstrap(observed, stat_func=bs_stats.mean,
                            alpha=percentile/100)
    print(dir(estimate))

    car_sim_dist = {
        'l_percentile': float(estimate.lower_bound),
        'r_percentile': float(estimate.upper_bound),
        'mean': float(estimate.value),
    }
    print(car_sim_dist)
    with open(cache_path, 'w+') as out:
        json.dump(car_sim_dist, out)
    return car_sim_dist
def plot_kl(datas, interval='t'):
    """Plot the column-wise mean of each data matrix on a shared axis.

    Args:
        datas: Iterable of 2-D arrays; columns are spread evenly over an
            x-range of 0 to 1000.
        interval: How the mean and its bounds are computed. ``'t'`` uses a
            Student-t interval; ``'bs'`` uses a non-pivotal bootstrap per
            column.

    Raises:
        ValueError: If ``interval`` is neither ``'t'`` nor ``'bs'``.
    """
    # Validate first: an unknown mode used to fail later with an undefined
    # `means`, after the figure had already been created.
    if interval not in ('t', 'bs'):
        raise ValueError(
            "interval must be 't' or 'bs', got {!r}".format(interval))

    fig, ax = plt.subplots(1, 1, figsize=set_size(width))
    for data in datas:
        n_rows, n_cols = data.shape[0], data.shape[1]
        x = np.linspace(0, 1000, n_cols)
        if interval == 't':
            means = data.mean(axis=0)
            se = stats.sem(data, axis=0)
            low, high = stats.t.interval(0.95, n_rows - 1, loc=means,
                                         scale=se)
        else:
            means = np.zeros(n_cols)
            low = np.zeros(n_cols)
            high = np.zeros(n_cols)
            for i in range(n_cols):
                temp = bs.bootstrap(data[:, i],
                                    stat_func=bs_stats.mean,
                                    alpha=0.05,
                                    is_pivotal=False)
                means[i] = temp.value
                low[i] = temp.lower_bound
                high[i] = temp.upper_bound
        ax.plot(x, means)
        # The bounds are computed, but shading them is currently disabled:
        # ax.fill_between(x, low, high, alpha=0.2)
    ax.set_ylim([0, 0.05])
def build_radius_predictor(epoch=50):
    """Train the radius model for ``epoch`` rounds and save it each round.

    The model is loaded from ``c_radius.h5`` (a fresh ``multi_filter_cnn``
    is built when loading fails). Every round draws 5000 new samples, fits
    for one epoch, saves the model, and prints a bootstrap of the mean over
    the most recent validation MAE values (at most 50).
    """
    train_new = False
    try:
        m = multi_filter_cnn(output_dim=1) if train_new else load_model(
            'c_radius.h5')
    except Exception as e:
        print(e)
        m = multi_filter_cnn(output_dim=1)
    m.compile(optimizer='adam', loss='MSE', metrics=['MAE'])

    return_original = True
    from task_env import get_samples

    buffer_size = 50
    bs_buffer = []
    while epoch:
        np.random.seed(None)
        X, X_prime, Y = [], [], []
        for x, y in get_samples(5000,
                                norm=False,
                                return_original=return_original,
                                noise_lvl=2):
            if return_original:
                x, x_prime = x
                X_prime.append(np.expand_dims(x_prime, -1))
            X.append(np.expand_dims(x, -1))
            # Only the last target component is the regression target here.
            Y.append(y[-1:])
        X = np.array(X)
        Y = np.array(Y)
        X_prime = np.array(X_prime)

        history = m.fit([X_prime, X_prime],
                        Y,
                        epochs=1,
                        validation_split=0.1,
                        batch_size=32,
                        shuffle=True,
                        verbose=2)
        m.save('c_radius.h5')

        # Newest values go to the front; the oldest are dropped at the back.
        for val_mae in history.history['val_mean_absolute_error']:
            bs_buffer.insert(0, val_mae)
        del bs_buffer[buffer_size:]

        bs_ret = bs.bootstrap(np.array(bs_buffer), stat_func=bs_stats.mean)
        print(bs_ret)
        epoch -= 1
def test_bootstrap(self):
    """Check the bootstrapped mean of normal samples and the effect of alpha.

    The default run must land near 100 with bounds near 98 and 102; a run
    with ``alpha=0.1`` must give the same value within 2 and a strictly
    narrower interval.
    """
    samples = np.random.normal(loc=100, scale=10, size=5000)

    default_run = bs.bootstrap(samples, bs_stats.mean)
    expectations = (('value', 100), ('upper_bound', 102), ('lower_bound', 98))
    for field, expected in expectations:
        self.assertAlmostEqual(getattr(default_run, field), expected, delta=2)

    wider_alpha_run = bs.bootstrap(samples, bs_stats.mean, alpha=0.1)
    self.assertAlmostEqual(default_run.value, wider_alpha_run.value, delta=2)
    self.assertGreater(default_run.upper_bound, wider_alpha_run.upper_bound)
    self.assertLess(default_run.lower_bound, wider_alpha_run.lower_bound)
def compute_stats(est, imgpairs):
    """Compute per-pair vote shares and per-group confidence intervals.

    Writes ``votes1``, ``votes2`` and ``group`` columns into ``imgpairs``
    (in place), bootstraps the mean of ``votes1`` per group, then prints
    percentile bounds from 10000 rounds of resampling 40 votes per pair.

    NOTE(review): the function calls ``sys.exit(1)`` right before its
    ``return`` statement, so as written it never returns to the caller.
    Confirm whether that exit is leftover debugging.
    """
    # Defaults for pairs that have no votes in `est`.
    imgpairs["votes1"], imgpairs["votes2"] = 0.5, 0.5
    imgpairs["group"] = ""
    # CIs based on the per-images vote distribution.
    for ip, p in imgpairs.iterrows():
        # Votes for this pair; pairname is the concatenation of both names.
        cest = est.loc[est.pairname == p.img1 + p.img2]
        if (cest.shape[0] > 0):
            imgpairs.loc[ip, "group"] = cest.group.values[0]
            imgpairs.loc[ip, "votes1"] = cest.loc[
                cest.vote == p.img1].shape[0] / cest.shape[0]
            imgpairs.loc[ip, "votes2"] = cest.loc[
                cest.vote == p.img2].shape[0] / cest.shape[0]
    #estcnt = est.groupby("pairname").apply(lambda x: )
    # Bootstrap of the mean votes1 share per group (includes the "" group).
    gdv = {}
    for g in np.unique(imgpairs.group.values):
        cres = imgpairs.loc[imgpairs.group == g]
        gdv[g] = bs.bootstrap(cres.votes1.values,
                              stat_func=bs_stats.mean,
                              alpha=0.05,
                              num_iterations=10000)
    # Sorted group names, without the "" placeholder of unvoted pairs.
    g = np.sort(np.unique(imgpairs.group.values))
    if (g[0] == ""):
        g = g[1:]
    print("Count of images per group:",
          imgpairs.groupby("group").apply(lambda x: x.shape[0]))
    # CIs based on separate resampling of votes for each image.
    means = collections.defaultdict(list)
    allgroups = np.unique(imgpairs.group.values)
    for count in range(10000):
        # Progress output every 100 rounds.
        if (count % 100 == 0):
            print(count)
        cmeans = collections.defaultdict(list)
        for ip, p in imgpairs.iterrows():
            cest = est.loc[est.pairname == p.img1 + p.img2]
            if (cest.shape[0] == 0):
                continue
            # Share of img1 votes among 40 votes drawn with replacement.
            cmeans[cest.group.values[0]].append(
                np.mean(np.random.choice(cest.vote == p.img1, 40,
                                         replace=True)))
        for group in allgroups:
            means[group].append(np.mean(cmeans[group]))
    for group in allgroups:
        # NOTE(review): percentiles 0 and 95 give the minimum and a
        # one-sided upper bound; confirm this is the intended interval.
        pCI = np.percentile(means[group], [0, 95])
        print("%s: %.3lf (%.3lf, %.3lf)" %
              (group, np.mean(imgpairs.loc[imgpairs.group == group,
                                           "votes1"]), pCI[0], pCI[1]))
    # NOTE(review): terminates the process; the return below is unreachable.
    sys.exit(1)
    return imgpairs, g, gdv
def statistical(sample, ss, sz, _alpha):
    """Bootstrap the mean of ``te`` over a random resample of ``sample``.

    Args:
        sample: Indexable collection of ``(height, weight)`` pairs.
        ss: First argument to ``np.random.choice`` (the index population).
        sz: Requested resample size, capped at 10000.
        _alpha: Passed through to ``bs.bootstrap``.

    Returns:
        ``(lower_bound, value, upper_bound)`` of the bootstrapped mean.
    """
    picks = np.random.choice(ss, min(sz, 10000))
    drawn = [sample[idx] for idx in picks]
    scores = np.array([te(height, weight) for height, weight in drawn])
    estimate = bs.bootstrap(scores, stat_func=bs_stats.mean, alpha=_alpha)
    return (estimate.lower_bound, estimate.value, estimate.upper_bound)
def real_data_test3w():
    """Check how often a bucketed bootstrap interval covers the overall CTR.

    Reads ``1005-ctr.sql`` (tab-separated), computes the overall
    click-through rate, then for 1000 rounds draws (or reloads from
    ``data/0928A30w_<i>``) a 30000-row sample split into 50 buckets,
    bootstraps the ratio of bucket click sums to bucket exposure sums, and
    counts whether the interval covers the overall rate. Finally prints the
    first 20 bucket exposure sums of the last round.
    """
    df = pd.read_csv("1005-ctr.sql", sep='\t')
    total_ctr = float(np.sum(df["cli_pv"])) / np.sum(df["exp_pv"])
    p_out, p_in = 0, 0
    sample_size = 30000
    bucket_num = 50
    split_num = sample_size / bucket_num
    for i in range(0, 1000):
        print("{0}th test--------------------".format(i))
        # Bucket id for each sampled row, by position.
        buck_index = np.floor(np.arange(0, sample_size) / split_num)
        filename1 = "data/0928A30w_{0}".format(i)
        if os.path.exists(filename1):
            sample1 = pd.read_csv(filename1, sep='\t')
        else:
            sample1 = df.sample(n=sample_size)
            sample1["bucket_index"] = buck_index
            sample1.to_csv(filename1, sep='\t')
        # Select columns with a list: tuple-style selection on a groupby
        # was removed in pandas 2.0.
        sample_0928 = sample1.groupby(['bucket_index'])[[
            "cli_pv", "exp_pv"
        ]].sum().add_suffix('_sum').reset_index()

        r = bs.bootstrap(sample_0928.cli_pv_sum.values,
                         bs_stats.mean,
                         denominator_values=sample_0928.exp_pv_sum.values)
        point, low, high = r.value, r.lower_bound, r.upper_bound
        if low <= total_ctr <= high:
            p_in = p_in + 1
            flag = 1
        else:
            p_out = p_out + 1
            flag = 0
        print("flag:{0}, diff:{1}, real:{2}, low:{3}, high:{4}, width:{5}".
              format(flag, point - total_ctr, total_ctr, low, high,
                     high - low))
        if i % 50 == 0 or i == 999:
            print("30w,50bucket,not cover:{0},cover:{1}".format(p_out, p_in))
    # print() call instead of the Python 2 print statement, which is a
    # syntax error under Python 3.
    for exposure_sum in sample_0928.exp_pv_sum.values[:20]:
        print(exposure_sum)
    print("end")
def build_center_predictor(epoch=50):
    """Train the centre model for ``epoch`` rounds and save it each round.

    Registers ``euclidean_distance_loss`` as a Keras custom object, loads
    the model from ``c_center.h5`` (a fresh ``multi_filter_cnn`` is built
    when loading fails), and every round draws 5000 new samples, fits for
    one epoch, saves the model, and prints a bootstrap of the mean over the
    most recent validation losses (at most 50).
    """
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update(
        {"euclidean_distance_loss": euclidean_distance_loss})

    train_new = False
    try:
        m = multi_filter_cnn() if train_new else load_model('c_center.h5')
    except Exception as e:
        print(e)
        m = multi_filter_cnn()
    m.compile(optimizer='adam',
              loss=euclidean_distance_loss,
              metrics=['MAE'])

    return_original = True
    from task_env import get_samples

    buffer_size = 50
    bs_buffer = []
    while epoch:
        np.random.seed(None)
        X, X_prime, Y = [], [], []
        for x, y in get_samples(5000,
                                norm=False,
                                return_original=return_original,
                                noise_lvl=2):
            if return_original:
                x, x_prime = x
                X_prime.append(np.expand_dims(x_prime, -1))
            X.append(np.expand_dims(x, -1))
            # The first two target components are the regression target.
            Y.append(y[:2])
        Y = np.array(Y)
        X_prime = np.array(X_prime)

        history = m.fit([X_prime, X_prime],
                        Y,
                        epochs=1,
                        validation_split=0.1,
                        batch_size=32,
                        shuffle=True,
                        verbose=1)
        m.save('c_center.h5')

        # Newest values go to the front; the oldest are dropped at the back.
        for val_loss in history.history['val_loss']:
            bs_buffer.insert(0, val_loss)
        del bs_buffer[buffer_size:]

        bs_ret = bs.bootstrap(np.array(bs_buffer), stat_func=bs_stats.mean)
        print(bs_ret)
        epoch -= 1
def getBootstrapGHS(beta1, beta2, density, bootstrapSampleSize, weightType):
    """Bootstrap the mean GHS value between two objects' distributions.

    ``ghs2.ghs2`` is evaluated ``bootstrapSampleSize`` times on freshly
    requested distributions; only the third element of its result is kept.

    Returns:
        The ``bs.bootstrap`` result for the mean of the collected values.
    """
    def _single_ghs():
        _bca, _bcl, ghs = ghs2.ghs2(beta1.getDistribution(density),
                                    beta2.getDistribution(density),
                                    weightType,
                                    onlyGHS=False)
        return ghs

    ghsValues = [_single_ghs() for _ in range(bootstrapSampleSize)]
    return bs.bootstrap(numpy.array(ghsValues), stat_func=bs_stats.mean)
def estimator_bootstrap(err, custom_stat=None, alpha=0.05, n_iter=10000):
    """Bootstrap ``custom_stat`` over ``err``.

    Example of a statistic function (root mean square)::

        def custom_stat(values, axis=1):
            return np.sqrt(np.mean(np.asmatrix(values*values), axis=axis))

    Args:
        err: Values to resample.
        custom_stat: Statistic function passed as ``stat_func``.
        alpha: Passed through to ``bs.bootstrap``.
        n_iter: Number of bootstrap iterations.

    Returns:
        The result of ``bs.bootstrap``.
    """
    import bootstrapped.bootstrap as bs

    return bs.bootstrap(err,
                        stat_func=custom_stat,
                        alpha=alpha,
                        num_iterations=n_iter)
def bootstrap_KG(subproblem, attributes, sample_solutions):
    """Evaluate the bootstrap for KG samples.

    Parameters
    ----------
    subproblem
        Not used by this function.
    attributes : dict
        Supplies ``'resamples'`` (number of bootstrap iterations) and
        ``'confidence'`` (passed as ``alpha``).
    sample_solutions
        KG sample values to resample.

    Returns
    -------
    KGmean : float
        Mean of the bootstrap distribution of the sample mean.
    KGstd : float
        Mean of the bootstrap distribution of the sample standard deviation.
    """
    kg_samples = np.array(sample_solutions)
    shared_options = dict(alpha=attributes['confidence'],
                          num_iterations=attributes['resamples'],
                          iteration_batch_size=None,
                          is_pivotal=True,
                          num_threads=1,
                          return_distribution=True)

    # Use bootstrap to approximate the mean, then the std.
    mean_distribution = bs.bootstrap(values=kg_samples,
                                     stat_func=bs_stats.mean,
                                     **shared_options)
    std_distribution = bs.bootstrap(values=kg_samples,
                                    stat_func=bs_stats.std,
                                    **shared_options)
    return mean_distribution.mean(), std_distribution.mean()