# Common imports assumed by the snippets below ('bas' and 'bs_power' are the
# aliases some of the snippets use).
import os
import random
from random import randint
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse, stats
from scipy.stats import ks_2samp, ttest_ind, mannwhitneyu, rankdata

import bootstrapped.bootstrap as bs
import bootstrapped.bootstrap as bas
import bootstrapped.power as bs_power
import bootstrapped.stats_functions as bs_stats
import bootstrapped.compare_functions as bs_compare


def test_pivotal(self):
    mean = 100
    stdev = 10

    test = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
    test = test * 1.1

    bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change)
    bsr_percent = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                                  bs_compare.percent_change, is_pivotal=False)

    self.assertAlmostEqual(bsr.value, bsr_percent.value, delta=.1)
    self.assertAlmostEqual(bsr.lower_bound, bsr_percent.lower_bound, delta=.1)
    self.assertAlmostEqual(bsr.upper_bound, bsr_percent.upper_bound, delta=.1)

    bsr = bs.bootstrap(test, bs_stats.mean)
    bsr_threaded = bs.bootstrap(test, bs_stats.mean, num_threads=10)

    self.assertAlmostEqual(bsr.value, bsr_threaded.value, delta=.1)
    self.assertAlmostEqual(bsr.lower_bound, bsr_threaded.lower_bound, delta=.1)
    self.assertAlmostEqual(bsr.upper_bound, bsr_threaded.upper_bound, delta=.1)
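# A minimal sketch (using the common imports above) contrasting the default
# pivotal intervals with percentile intervals on the same sample; on roughly
# symmetric data the two nearly coincide, which is what test_pivotal asserts.
# Synthetic data, for illustration only.
samples = np.random.normal(loc=100, scale=10, size=5000)
print(bs.bootstrap(samples, bs_stats.mean))                    # pivotal (default)
print(bs.bootstrap(samples, bs_stats.mean, is_pivotal=False))  # percentile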
def test_bootstrap_ab(self):
    mean = 100
    stdev = 10

    test = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
    test = test * 1.1

    bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change)
    self.assertAlmostEqual(bsr.value, 10, delta=.5)

    bsr2 = bs.bootstrap_ab(test, ctrl, bs_stats.sum, bs_compare.percent_change)
    self.assertAlmostEqual(bsr2.value, -88, delta=2)

    bsr3 = bs.bootstrap_ab(test, ctrl, bs_stats.sum, bs_compare.percent_change,
                           scale_test_by=10.)
    self.assertAlmostEqual(bsr3.value, 10, delta=.5)

    test_denom = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl_denom = np.random.normal(loc=mean, scale=stdev, size=5000)
    test_denom = test_denom * 1.1

    bsr4 = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change,
                           test_denominator=test_denom,
                           ctrl_denominator=ctrl_denom)
    self.assertAlmostEqual(bsr4.value, 0, delta=.5)
def test_bootstrap_ab_sparse(self):
    mean = 100
    stdev = 10

    test = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
    test = test * 1.1

    test_sp = sparse.csr_matrix(test)
    ctrl_sp = sparse.csr_matrix(ctrl)

    bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change)
    bsr_sp = bs.bootstrap_ab(test_sp, ctrl_sp, bs_stats.mean,
                             bs_compare.percent_change)

    self.assertAlmostEqual(bsr.value, bsr_sp.value, delta=.1)
    self.assertAlmostEqual(bsr.upper_bound, bsr_sp.upper_bound, delta=.1)
    self.assertAlmostEqual(bsr.lower_bound, bsr_sp.lower_bound, delta=.1)
def compute_stat_from_eval_perfs(data_path, n1, n2):
    eval_perfs = np.empty([n1 + n2, 251]) * np.nan
    scores_our = []
    for i, f in enumerate(sorted(os.listdir(data_path))):
        if 'our' in f:
            scores = np.loadtxt(data_path + f)
            for j in range(scores.shape[0]):
                scores_our.append(scores[j])
        else:
            if '2M' in f:
                tmp = np.concatenate(
                    [np.zeros([20, 1]), np.loadtxt(data_path + f)], axis=1)
            else:
                tmp = np.loadtxt(data_path + f)
            if tmp.shape[1] == 252:
                tmp = tmp[:, :-1]
            eval_perfs[i * 20:(i + 1) * n1, :] = tmp

    # compute statistics
    data1_litt = np.nanmean(eval_perfs[:n1][:, -10:], axis=1)
    data2_litt = np.nanmean(eval_perfs[n1:][:, -10:], axis=1)
    ks_litt, p_ks_litt = ks_2samp(data1_litt, data2_litt)
    ttest_litt, p_ttest_litt = ttest_ind(data1_litt, data2_litt, equal_var=False)

    data1_our = np.array(scores_our[:n1])
    data2_our = np.array(scores_our[n1:])
    ks_our, p_ks_our = ks_2samp(data1_our, data2_our)
    ttest_our, p_ttest_our = ttest_ind(data1_our, data2_our, equal_var=False)

    # estimation of confidence intervals with the bootstrap method,
    # https://github.com/facebookincubator/bootstrapped
    res_litt = bs.bootstrap_ab(data1_litt, data2_litt, bs_stats.mean,
                               bs_compare.difference, num_iterations=10000)
    sign_litt = np.sign(res_litt.upper_bound) == np.sign(res_litt.lower_bound)
    res_our = bs.bootstrap_ab(data1_our, data2_our, bs_stats.mean,
                              bs_compare.difference, num_iterations=10000)
    sign_our = np.sign(res_our.upper_bound) == np.sign(res_our.lower_bound)

    toSave = np.zeros([4, 4])
    toSave[0:2, :] = np.array([[ks_litt, p_ks_litt, ttest_litt, p_ttest_litt],
                               [ks_our, p_ks_our, ttest_our, p_ttest_our]])
    toSave[2, :] = np.array([res_litt.value, res_litt.lower_bound,
                             res_litt.upper_bound,
                             sign_litt * np.sign(res_litt.lower_bound)])
    toSave[3, :] = np.array([res_our.value, res_our.lower_bound,
                             res_our.upper_bound,
                             sign_our * np.sign(res_our.lower_bound)])
    np.savetxt(data_path + 'stats', toSave)
def test_bootstrap_batch_size(self):
    mean = 100
    stdev = 10

    test = np.random.normal(loc=mean, scale=stdev, size=500)
    ctrl = np.random.normal(loc=mean, scale=stdev, size=5000)
    test = test * 1.1

    bsr = bs.bootstrap_ab(test, ctrl, bs_stats.mean, bs_compare.percent_change)
    bsr_batch = bs.bootstrap_ab(test, ctrl, bs_stats.mean,
                                bs_compare.percent_change,
                                iteration_batch_size=10)

    self.assertAlmostEqual(bsr.value, bsr_batch.value, delta=.1)
    self.assertAlmostEqual(bsr.lower_bound, bsr_batch.lower_bound, delta=.1)
    self.assertAlmostEqual(bsr.upper_bound, bsr_batch.upper_bound, delta=.1)

    bsr = bs.bootstrap(test, bs_stats.mean)
    bsr_batch = bs.bootstrap(test, bs_stats.mean, iteration_batch_size=10)

    self.assertAlmostEqual(bsr.value, bsr_batch.value, delta=.1)
    self.assertAlmostEqual(bsr.lower_bound, bsr_batch.lower_bound, delta=.1)
    self.assertAlmostEqual(bsr.upper_bound, bsr_batch.upper_bound, delta=.1)
def compare_stats(data_path, n1, n2):
    """
    Computes statistical tests to assess whether two algorithms perform
    statistically differently.

    data_path should include the scores_absolute and scores_final of the two
    algorithms, e.g.: 1_scores_final_algo1, 2_scores_final_algo2,
    3_scores_absolute_algo1, 4_scores_absolute_algo2. These files are created
    by the 'compute_plot_all' function.

    We compute the Kolmogorov-Smirnov test, Welch's t-test, and a bootstrap
    confidence interval of the difference in performance for the absolute and
    final metrics.
    """
    print('Running statistical tests..')
    eval_perfs = np.empty([n1 + n2, 1001]) * np.nan  # 1001 is the length of an episode
    scores_absolute = []
    scores_final = []
    for i, f in enumerate(sorted(os.listdir(data_path))):
        if 'absolute' in f:
            scores = np.loadtxt(data_path + f)
            for j in range(scores.shape[0]):
                scores_absolute.append(scores[j])
        if 'final' in f:
            scores = np.loadtxt(data_path + f)
            for j in range(scores.shape[0]):
                scores_final.append(scores[j])

    data1_absolute = np.array(scores_absolute[:n1])
    data2_absolute = np.array(scores_absolute[n1:])
    data1_final = np.array(scores_final[:n1])
    data2_final = np.array(scores_final[n1:])

    ks_final, p_ks_final = ks_2samp(data1_final, data2_final)
    ttest_final, p_ttest_final = ttest_ind(data1_final, data2_final,
                                           equal_var=False)
    ks_absolute, p_ks_absolute = ks_2samp(data1_absolute, data2_absolute)
    ttest_absolute, p_ttest_absolute = ttest_ind(data1_absolute, data2_absolute,
                                                 equal_var=False)

    # estimation of confidence intervals with the bootstrap method,
    # https://github.com/facebookincubator/bootstrapped
    res_final = bs.bootstrap_ab(data1_final, data2_final, bs_stats.mean,
                                bs_compare.difference, num_iterations=10000)
    sign_final = np.sign(res_final.upper_bound) == np.sign(res_final.lower_bound)
    res_absolute = bs.bootstrap_ab(data1_absolute, data2_absolute, bs_stats.mean,
                                   bs_compare.difference, num_iterations=10000)
    sign_absolute = np.sign(res_absolute.upper_bound) == np.sign(res_absolute.lower_bound)

    toSave = np.zeros([4, 4])
    toSave[0:2, :] = np.array([[ks_final, p_ks_final, ttest_final, p_ttest_final],
                               [ks_absolute, p_ks_absolute, ttest_absolute, p_ttest_absolute]])
    toSave[2, :] = np.array([res_final.value, res_final.lower_bound,
                             res_final.upper_bound,
                             sign_final * np.sign(res_final.lower_bound)])
    toSave[3, :] = np.array([res_absolute.value, res_absolute.lower_bound,
                             res_absolute.upper_bound,
                             sign_absolute * np.sign(res_absolute.lower_bound)])
    np.savetxt(data_path + 'stats', toSave)
    print('Done.')
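# Hypothetical companion sketch: reading back the 4x4 'stats' matrix written by
# compare_stats above (assumes data_path is defined and the file exists). The
# row layout follows the code above; the variable names are illustrative.
stats_matrix = np.loadtxt(data_path + 'stats')
ks_f, p_ks_f, t_f, p_t_f = stats_matrix[0]          # KS and Welch's t-test, final metric
ks_a, p_ks_a, t_a, p_t_a = stats_matrix[1]          # KS and Welch's t-test, absolute metric
value_f, low_f, high_f, signed_f = stats_matrix[2]  # bootstrap CI, final metric
value_a, low_a, high_a, signed_a = stats_matrix[3]  # bootstrap CI, absolute metric
print('final: KS p=%g, Welch p=%g, CI=[%g, %g]' % (p_ks_f, p_t_f, low_f, high_f))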
def bootstrap_test(data1, data2, alpha=0.05):
    """
    Wraps the bootstrap confidence interval test from
    https://github.com/facebookincubator/bootstrapped/.

    Params
    ------
    - data1 (ndarray of dim 1) The performance measures of Algo1.
    - data2 (ndarray of dim 1) The performance measures of Algo2.
    - alpha (float in ]0,1[) The significance level used by the bootstrap test.
    """
    data1 = data1.squeeze()
    data2 = data2.squeeze()
    assert 0 < alpha < 1, "alpha should be between 0 and 1"
    res = bs.bootstrap_ab(data1, data2, bs_stats.mean, bs_compare.difference,
                          alpha=alpha, num_iterations=10000)
    # the difference is significant when the confidence interval excludes zero,
    # i.e. when both bounds have the same sign
    decision = np.sign(res.upper_bound) == np.sign(res.lower_bound)
    if decision:
        if np.sign(res.upper_bound) < 0:
            print("\n\nResult of the bootstrap test at level %02g: μ2>μ1, the "
                  "test passed with a confidence interval for μ1-μ2 of "
                  "[%02g, %02g]." % (alpha, res.lower_bound, res.upper_bound))
        else:
            print("\n\nResult of the bootstrap test at level %02g: μ1>μ2, the "
                  "test passed with a confidence interval for μ1-μ2 of "
                  "[%02g, %02g]." % (alpha, res.lower_bound, res.upper_bound))
    else:
        print("\n\nResult of the bootstrap test at level %02g: there is not "
              "enough evidence to establish an order relation between μ1 and "
              "μ2." % alpha)
    print("Bootstrap test done.")
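# Minimal usage sketch for bootstrap_test; the two synthetic samples below are
# assumptions for illustration, standing in for per-seed performance measures.
algo1_scores = np.random.normal(loc=100, scale=10, size=30)
algo2_scores = np.random.normal(loc=110, scale=10, size=30)
bootstrap_test(algo1_scores, algo2_scores, alpha=0.05)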
def bootstrapped_mean_difference_interval_for_binomial(
        data1, data2, alpha=0.05):
    """
    Return the bootstrap confidence interval of the mean difference for a
    binomial metric.

    Parameters
    ----------
    data1, data2 : DataFrame-like structures holding [0, 1] values (a
        'denominator' column is added in place).
    alpha : The alpha value for the confidence intervals.

    Returns
    -------
    bootstrapped_interval : The bootstrap confidence interval for a given
        distribution.
    """
    data1['denominator'] = 1
    data2['denominator'] = 1
    bootstrapped_interval = bs.bootstrap_ab(
        data1, data2,
        stat_func=bs_stats.sum,
        compare_func=bs_compare.difference,
        test_denominator=data1['denominator'],
        ctrl_denominator=data2['denominator'],
        alpha=alpha,
        return_distribution=False)
    return bootstrapped_interval
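# The denominator trick above, written directly against the library: with
# stat_func=bs_stats.sum over the values and a denominator of ones, the
# bootstrapped statistic is sum(x)/sum(1) = mean(x), resampling numerator and
# denominator together. A sketch on synthetic 0/1 conversions (assumed data).
test_vals = np.random.binomial(1, 0.11, size=2000).astype(float)
ctrl_vals = np.random.binomial(1, 0.10, size=2000).astype(float)
res = bs.bootstrap_ab(test_vals, ctrl_vals,
                      stat_func=bs_stats.sum,
                      compare_func=bs_compare.difference,
                      test_denominator=np.ones_like(test_vals),
                      ctrl_denominator=np.ones_like(ctrl_vals),
                      alpha=0.05)
print(res.value, res.lower_bound, res.upper_bound)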
def run_simulation2(data, data2):
    results = []
    for i in range(3000):
        results.append(
            bas.bootstrap_ab(data, data2, bs_stats.mean,
                             bs_compare.percent_change))
    return results
def run_test(test_id, data1, data2, alpha=0.05):
    """
    Compute tests comparing data1 and data2 with confidence level alpha
    :param test_id: (str) refers to what test should be used
    :param data1: (np.ndarray) sample 1
    :param data2: (np.ndarray) sample 2
    :param alpha: (float) confidence level of the test
    :return: (bool) if True, the null hypothesis is rejected
    """
    data1 = data1.squeeze()
    data2 = data2.squeeze()
    n1 = data1.size
    n2 = data2.size

    if test_id == 'bootstrap':
        assert 0 < alpha < 1, "alpha should be between 0 and 1"
        res = bs.bootstrap_ab(data1, data2, bs_stats.mean,
                              bs_compare.difference, alpha=alpha,
                              num_iterations=1000)
        rejection = np.sign(res.upper_bound) == np.sign(res.lower_bound)
        return rejection
    elif test_id == 't-test':
        _, p = ttest_ind(data1, data2, equal_var=True)
        return p < alpha
    elif test_id == "Welch t-test":
        _, p = ttest_ind(data1, data2, equal_var=False)
        return p < alpha
    elif test_id == 'Mann-Whitney':
        _, p = mannwhitneyu(data1, data2, alternative='two-sided')
        return p < alpha
    elif test_id == 'Ranked t-test':
        all_data = np.concatenate([data1.copy(), data2.copy()], axis=0)
        ranks = rankdata(all_data)
        ranks1 = ranks[:n1]
        ranks2 = ranks[n1:n1 + n2]
        assert ranks2.size == n2
        _, p = ttest_ind(ranks1, ranks2, equal_var=True)
        return p < alpha
    elif test_id == 'permutation':
        all_data = np.concatenate([data1.copy(), data2.copy()], axis=0)
        delta = np.abs(data1.mean() - data2.mean())
        num_samples = 1000
        estimates = []
        for _ in range(num_samples):
            estimates.append(run_permutation_test(all_data.copy(), n1, n2))
        estimates = np.abs(np.array(estimates))
        diff_count = len(np.where(estimates <= delta)[0])
        return (1.0 - (float(diff_count) / float(num_samples))) < alpha
    else:
        raise NotImplementedError
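# Usage sketch for run_test on synthetic samples ('permutation' is omitted here
# because it depends on run_permutation_test, which is not shown above).
d1 = np.random.normal(0.0, 1.0, size=20)
d2 = np.random.normal(0.5, 1.0, size=20)
for tid in ['bootstrap', 't-test', "Welch t-test", 'Mann-Whitney', 'Ranked t-test']:
    print(tid, run_test(tid, d1, d2, alpha=0.05))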
def run_simulation(data):
    lift = 1.25
    results = []
    for i in range(3000):
        random.shuffle(data)
        # use integer division so the slice indices are ints (Python 3)
        test = data[:len(data) // 2] * lift
        ctrl = data[len(data) // 2:]
        results.append(
            bas.bootstrap_ab(test, ctrl, bs_stats.mean,
                             bs_compare.percent_change))
    return results
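# Hedged sketch of consuming the simulation output with the library's power
# module (the bs_power alias from the common imports above, which later
# snippets also use); power_stats summarizes how the 3000 BootstrapResults
# split between significantly positive, significantly negative, and
# insignificant outcomes. The input array is an assumption for illustration.
sim_results = run_simulation(np.random.normal(100, 10, size=1000))
print(bs_power.power_stats(sim_results))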
def empirical_false_pos_rate(data, alpha=0.05):
    """
    Compute and plot empirical estimates of the probability of type-I error
    given a list of performance measures. If this list is of size N_data, this
    is done for N = 2 .. floor(N_data / 2). Two different tests are used: the
    bootstrap confidence interval test and the Welch's t-test, both with
    significance level alpha.

    Params
    ------
    - data (ndarray of dim 1) The performance measures of the considered algorithm.
    - alpha (float in ]0,1[) The significance level used by the two tests.
    """
    print('\n\nComputing empirical false positive rate ..')
    data = data.squeeze()
    sizes = range(2, data.size // 2)
    nb_reps = 1000
    results = np.zeros([nb_reps, len(sizes), 2])
    blue = [0, 0.447, 0.7410, 1]
    orange = [0.85, 0.325, 0.098, 1]

    for i_n, n in enumerate(sizes):
        print('  N =', n)
        ind = list(range(2 * n))
        for rep in range(nb_reps):
            # take two groups of size n in data, at random
            np.random.shuffle(ind)
            sample_1 = data[ind[:n]]
            sample_2 = data[ind[n:2 * n]]
            # perform the two-tailed Welch's t-test
            results[rep, i_n, 0] = stats.ttest_ind(sample_1, sample_2,
                                                   equal_var=False)[1] < alpha
            # perform the bootstrap confidence interval test
            res_final = bs.bootstrap_ab(sample_1, sample_2, bs_stats.mean,
                                        bs_compare.difference,
                                        num_iterations=10000)
            results[rep, i_n, 1] = np.sign(res_final.upper_bound) == np.sign(res_final.lower_bound)

    res_mean = results.mean(axis=0)
    plt.figure(figsize=(16, 10), frameon=False)
    plt.plot(sizes, alpha * np.ones(len(sizes)), c='k', linewidth=5,
             linestyle='--')
    plt.plot(sizes, res_mean[:, 0], color=blue, linewidth=4)
    plt.plot(sizes, res_mean[:, 1], color=orange, linewidth=4)
    plt.legend([u'α=%02g' % alpha, "Welch's $t$-test", 'bootstrap test'])
    plt.xlabel('sample size (N)')
    plt.ylabel('P(false positive)')
    plt.title(u'Estimation of type-I error rate as a function of $N$ when $α=0.05$')
    print("\n Given N=%i and α=%02g, you can expect false positive rates: \n"
          " For the Welch's t-test: %02g \n For the bootstrap test: %02g."
          % (data.size // 2, alpha, res_mean[-1, 0], res_mean[-1, 1]))
    print('Done.')
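# Usage sketch: estimate the false positive rate from one algorithm's
# performance measures (synthetic here). Note that nb_reps=1000 combined with
# num_iterations=10000 makes this slow; it is an illustration, not a benchmark.
perfs = np.random.normal(loc=100, scale=10, size=40)
empirical_false_pos_rate(perfs, alpha=0.05)
plt.show()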
def bootstrap_effect_size(test, ctrl) -> Tuple[float, Tuple[float, float]]:
    """
    Parameters
    ----------
    test : array-like of test-group measurements
    ctrl : array-like of control-group measurements

    Returns
    -------
    (effect size, (lower bound, upper bound))
    """
    comp = bs.bootstrap_ab(np.asarray(test), np.asarray(ctrl), bs_stats.median,
                           bs_compare.difference, num_threads=-1)
    return comp.value, (comp.lower_bound, comp.upper_bound)
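# Minimal usage sketch; num_threads=-1 in the helper above asks the library to
# use all available cores. Synthetic inputs, for illustration only.
effect, (low, high) = bootstrap_effect_size(np.random.normal(1.1, 0.3, size=200),
                                            np.random.normal(1.0, 0.3, size=200))
print('median difference: %.3f, 95%% CI [%.3f, %.3f]' % (effect, low, high))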
def test_hypothesis(group1, group2, name, group_names):
    group1 = group1.groupby(['sessionID']).median()["rating"].rename(group_names[0])
    group2 = group2.groupby(['sessionID']).median()["rating"].rename(group_names[1])
    likert_plot_hypo(group1, group2, name)
    print("Mean of", group1.name, "-", group1.mean())
    print("Mean of", group2.name, "-", group2.mean())
    bs_result = bs.bootstrap_ab(group1.to_numpy(), group2.to_numpy(),
                                bs_stats.median, bs_compare.percent_change)
    mean_change = bs_compare.percent_change(group1.mean(), group2.mean())
    print("Bootstrap result:", bs_result)
    print("Difference in mean from", group1.name, "to", group2.name,
          "(in percent)", mean_change)
    print("Is the difference significant?", bs_result.is_significant())
def bootstrapped_mean_difference_distribution_for_continuous(data1, data2):
    """
    Return the distribution of the bootstrapped mean difference for a
    continuous metric.

    Parameters
    ----------
    data1, data2 : One-dimensional arrays.

    Returns
    -------
    bootstrapped_mean_difference : Distribution of the mean difference
    """
    bootstrapped_mean_difference = bs.bootstrap_ab(
        data1, data2,
        stat_func=bs_stats.mean,
        compare_func=bs_compare.difference,
        return_distribution=True)
    return bootstrapped_mean_difference
def bootstrapped_mean_difference_interval_for_continuous(
        data1, data2, alpha=0.05):
    """
    Return the bootstrap confidence interval of the mean difference for a
    continuous metric.

    Parameters
    ----------
    data1, data2 : One-dimensional arrays.
    alpha : The alpha value for the confidence intervals.

    Returns
    -------
    bootstrapped_interval : The bootstrap confidence interval for a given
        distribution.
    """
    bootstrapped_interval = bs.bootstrap_ab(
        data1, data2,
        stat_func=bs_stats.mean,
        compare_func=bs_compare.difference,
        alpha=alpha,
        return_distribution=False)
    return bootstrapped_interval
def bootstrapped_mean_difference_distribution_for_binomial(data1, data2):
    """
    Return the distribution of the bootstrapped mean difference for a binomial
    metric.

    Parameters
    ----------
    data1, data2 : DataFrame-like structures holding [0, 1] values (a
        'denominator' column is added in place).

    Returns
    -------
    bootstrapped_mean_difference : Distribution of the mean difference
    """
    data1['denominator'] = 1
    data2['denominator'] = 1
    bootstrapped_mean_difference = bs.bootstrap_ab(
        data1, data2,
        stat_func=bs_stats.sum,
        compare_func=bs_compare.difference,
        test_denominator=data1['denominator'],
        ctrl_denominator=data2['denominator'],
        return_distribution=True)
    return bootstrapped_mean_difference
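# Sketch contrasting the two return modes of the helpers above:
# return_distribution=True yields the raw bootstrap samples of the difference
# (useful for histograms), while the interval helper collapses them to a point
# estimate plus bounds. Synthetic continuous data, assumed for illustration.
a = np.random.normal(10.0, 2.0, size=500)
b = np.random.normal(9.5, 2.0, size=500)
dist = bootstrapped_mean_difference_distribution_for_continuous(a, b)
ci = bootstrapped_mean_difference_interval_for_continuous(a, b, alpha=0.05)
print(len(dist), ci.value, ci.lower_bound, ci.upper_bound)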
mean = 100
stdev = 10

samples = np.random.normal(loc=mean, scale=stdev, size=5000)
samples_t = np.random.normal(loc=mean, scale=stdev, size=5000)

bsr = bs.bootstrap(samples, bs_stats.mean)
print(bsr)

bsr2 = bs.bootstrap(samples, bs_stats.mean, method="pi")
print("pi:")
print(bsr2)

bsr3 = bs.bootstrap(samples, bs_stats.trimmed_mean)
print("trimmed mean:")
print(bsr3)

bsr4 = bs.bootstrap_ab(samples, samples_t, bs_stats.trimmed_mean,
                       bs_compare.percent_change)
print("ab:")
print(bsr4)

bsr5 = bs.bootstrap_ab(samples, samples_t, bs_stats.trimmed_mean,
                       bs_compare.percent_change, method="pi")
print("ab pi:")
print(bsr5)
def run_simulation(group1, group2):
    results = []
    for i in range(3000):
        results.append(
            bs.bootstrap_ab(group1.to_numpy(), group2.to_numpy(), bs_stats.sum,
                            bs_compare.percent_change))
    return results
def real_pv_test(sample_size):
    df = pd.read_csv("0928-ctr.sql", sep='\t')
    df_1005 = pd.read_csv("1005-ctr.sql", sep='\t')
    total_pv = float(np.mean(df["exp_pv"]))
    total_pv_1005 = float(np.mean(df_1005["exp_pv"]))
    pv_diff = total_pv - total_pv_1005

    p_out, p_in, flag = 0, 0, 0
    zero_out, zero_in, zero_flag = 0, 0, 0
    ab_out, ab_in, ab_flag = 0, 0, 0
    c_out, c_in, c_flag = 0, 0, 0
    bucket_num = 50
    # total_click_pv = float(np.mean(df["cli_pv"]))
    split_num = sample_size / bucket_num

    for i in range(0, 1000):
        print("{0}th sample--------------------".format(i))
        buck_index = np.floor(np.arange(0, sample_size) / split_num)
        filename1 = "data/0928A1{1}_{0}".format(i, sample_size)
        filename2 = "data/0928A2{1}_{0}".format(i, sample_size)
        filename3 = "data/1005B{1}_{0}".format(i, sample_size)
        if (os.path.exists(filename1) and os.path.exists(filename2)
                and os.path.exists(filename3)):
            sample1 = pd.read_csv(filename1, sep='\t')
            sample2 = pd.read_csv(filename2, sep='\t')
            sample3 = pd.read_csv(filename3, sep='\t')
        else:
            sample1 = df.sample(n=sample_size)
            sample2 = df.sample(n=sample_size)
            sample3 = df_1005.sample(n=sample_size)
            sample1["bucket_index"] = buck_index
            sample2["bucket_index"] = buck_index
            sample3["bucket_index"] = buck_index
            sample1.to_csv(filename1, sep='\t')
            sample2.to_csv(filename2, sep='\t')
            sample3.to_csv(filename3, sep='\t')

        sample_0928 = sample1.groupby(['bucket_index'])[
            "cli_pv", "exp_pv"].mean().add_suffix('_sum').reset_index()
        sample_0928_1 = sample2.groupby(['bucket_index'])[
            "cli_pv", "exp_pv"].mean().add_suffix('_sum').reset_index()
        sample_1005 = sample3.groupby(['bucket_index'])[
            "cli_pv", "exp_pv"].mean().add_suffix('_sum').reset_index()

        ##### bootstrap #####
        #### total
        r = bs.bootstrap(sample_0928.exp_pv_sum.values, bs_stats.mean)
        point, low, high = r.value, r.lower_bound, r.upper_bound
        if low <= total_pv <= high:
            p_in = p_in + 1
            flag = 1
        else:
            p_out = p_out + 1
            flag = 0
        print("total,flag:{0}, diff:{1}, real:{2}, low:{3}, high:{4}, width:{5}"
              .format(flag, point - total_pv, total_pv, low, high, high - low))

        #### aa
        r = bs.bootstrap_ab(sample_0928.exp_pv_sum.values,
                            sample_0928_1.exp_pv_sum.values,
                            stat_func=bs_stats.mean,
                            compare_func=bs_compare.difference)
        # r = bs.bootstrap_ab(sample_0928.cli_pv_sum.values / sample_0928.exp_pv_sum.values,
        #                     sample_0928_1.cli_pv_sum.values / sample_0928_1.exp_pv_sum.values,
        #                     stat_func=bs_stats.mean,
        #                     compare_func=bs_compare.difference)
        point, low, high = r.value, r.lower_bound, r.upper_bound
        zero = 0.0
        if low <= zero <= high:
            zero_in = zero_in + 1
            zero_flag = 1
        else:
            zero_out = zero_out + 1
            zero_flag = 0
        print("flag:{0}, diff:{1}, real:{2}, low:{3}, high:{4}, width:{5}"
              .format(zero_flag, point - zero, 0.0, low, high, high - low))

        #### ab
        r = bs.bootstrap_ab(sample_0928.exp_pv_sum.values,
                            sample_1005.exp_pv_sum.values,
                            stat_func=bs_stats.mean,
                            compare_func=bs_compare.difference)
        point, low, high = r.value, r.lower_bound, r.upper_bound
        if low <= pv_diff <= high:
            ab_in = ab_in + 1
            ab_flag = 1
        else:
            ab_out = ab_out + 1
            ab_flag = 0
        print("flag:{0}, diff:{1}, real:{2}, low:{3}, high:{4}, width:{5}"
              .format(ab_flag, point - pv_diff, pv_diff, low, high, high - low))

        if i % 50 == 0 or i == 999:
            print("total,not cover:{0},cover:{1}".format(p_out, p_in))
            print("aatest,not cover:{0},cover:{1}".format(zero_out, zero_in))
            print("abtest,not cover:{0},cover:{1}".format(ab_out, ab_in))
    print("end")
    elif (pval < 0.05) and (st is not None):
        print('%s Different average' % test)
    else:
        print('%s is not applicable' % test)
    return 0


# 1. Sub-bucket method
data['subbucket'] = data['user_id'].apply(lambda x: randint(0, 1000))  # variant 1
data['subbucket'] = data['user_id'].apply(lambda x: hash(x) % 1000)    # variant 2

# 2. Bootstrap method
data_a = data[data['group'] == 'experiment_buckets']
data_b = data[data['group'] == 'control_buckets']

bs_ab_estims = bs.bootstrap_ab(
    data_a.groupby(data['user_id']).target.sum().values,
    data_b.groupby(data['user_id']).target.sum().values,
    bs_stats.mean,
    bs_compare.percent_change,
    num_iterations=5000,
    alpha=0.10,
    iteration_batch_size=100,
    scale_test_by=1,
    num_threads=4)

bs_data_a = bs.bootstrap(
    data_a.groupby(data['user_id']).target.sum().values,
    stat_func=bs_stats.mean,
    num_iterations=10000,
    iteration_batch_size=300,
    return_distribution=True)

bs_data_b = bs.bootstrap(
    data_b.groupby(data['user_id']).target.sum().values,
    stat_func=bs_stats.mean,
    num_iterations=10000,
    iteration_batch_size=300,
    return_distribution=True)

# Task:
# 1. Generate data in the following format:
#    group, bucket_id, user_id, target
#    target is the target variable;
#    bucket_id lies in the range 0 to 10;
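# Hedged sketch generating data in the format the task above describes (group,
# bucket_id, user_id, target); every distributional choice here is an
# assumption made for illustration.
n_users = 10000
data = pd.DataFrame({
    'user_id': np.arange(n_users),
    'group': np.random.choice(['experiment_buckets', 'control_buckets'],
                              size=n_users),
    'bucket_id': np.random.randint(0, 11, size=n_users),  # 0..10 inclusive
    'target': np.random.poisson(2.0, size=n_users),       # target variable
})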
def compute_stats_vs(data_path, n1, n2):
    run = {}
    run['perfs'] = []
    scores_our = []
    for i, trial in enumerate(sorted(os.listdir(data_path))):
        if len(trial) < 3:
            print('Extracting: ', trial)
            filename = data_path + trial + '/progress.json'
            steps, eval_rewards = extract_performances(filename)
            run['perfs'].append(eval_rewards)
            scores = np.loadtxt(data_path + trial + '/scores')
            scores_our.append(scores[1, 0])

    n_runs = len(run['perfs'])
    assert n1 + n2 == n_runs
    max_steps = 0
    for i in range(n_runs):
        if len(run['perfs'][i]) > max_steps:
            max_steps = len(run['perfs'][i])
    eval_perfs = np.empty([n_runs, max_steps]) * np.nan
    for i in range(n_runs):
        eval_perfs[i, :len(run['perfs'][i])] = run['perfs'][i]

    inds = np.array(range(n_runs))
    # np.random.shuffle(inds)

    # modify GEP source
    for i in range(n1, n2):
        eval_perfs[i, 251:] = eval_perfs[i, :750]
        eval_perfs[i, :251] = np.zeros([251])
    print(inds)

    # compute statistics
    data1_litt = np.nanmean(eval_perfs[inds[:n1]][:, -10:], axis=1)
    data2_litt = np.nanmean(eval_perfs[inds[n1:]][:, -10:], axis=1)
    ks_litt, p_ks_litt = ks_2samp(data1_litt, data2_litt)
    ttest_litt, p_ttest_litt = ttest_ind(data1_litt, data2_litt, equal_var=False)

    data1_our = np.array(scores_our[:n1])
    data2_our = np.array(scores_our[n1:])
    ks_our, p_ks_our = ks_2samp(data1_our, data2_our)
    ttest_our, p_ttest_our = ttest_ind(data1_our, data2_our, equal_var=False)

    # estimation of confidence intervals with the bootstrap method,
    # https://github.com/facebookincubator/bootstrapped
    res_litt = bs.bootstrap_ab(data1_litt, data2_litt, bs_stats.mean,
                               bs_compare.difference, num_iterations=10000)
    sign_litt = np.sign(res_litt.upper_bound) == np.sign(res_litt.lower_bound)
    res_our = bs.bootstrap_ab(data1_our, data2_our, bs_stats.mean,
                              bs_compare.difference, num_iterations=10000)
    sign_our = np.sign(res_our.upper_bound) == np.sign(res_our.lower_bound)

    toSave = np.zeros([4, 4])
    toSave[0:2, :] = np.array([[ks_litt, p_ks_litt, ttest_litt, p_ttest_litt],
                               [ks_our, p_ks_our, ttest_our, p_ttest_our]])
    toSave[2, :] = np.array([res_litt.value, res_litt.lower_bound,
                             res_litt.upper_bound,
                             sign_litt * np.sign(res_litt.lower_bound)])
    toSave[3, :] = np.array([res_our.value, res_our.lower_bound,
                             res_our.upper_bound,
                             sign_our * np.sign(res_our.lower_bound)])
    np.savetxt(data_path + 'stats', toSave)
            bs_compare.percent_change))
    return results


def run_simulation2(data, data2):
    results = []
    for i in range(3000):
        results.append(
            bas.bootstrap_ab(data, data2, bs_stats.mean,
                             bs_compare.percent_change))
    return results


print("bootstrap a/b",
      bas.bootstrap_ab(final_average_return, final_average_return2,
                       bs_stats.mean, bs_compare.percent_change))

bab = bas.bootstrap_ab(final_average_return, final_average_return2,
                       bs_stats.mean, bs_compare.percent_change)

x = run_simulation2(final_average_return, final_average_return2)
bootstrap_ab = bs_power.power_stats(x)
print("power analysis bootstrap a/b")
print(bootstrap_ab)

print("***********************************************")
print("Bootstrap analysis")
print("***********************************************")
print("***Arg1****")
sim = bas.bootstrap(final_average_return, stat_func=bs_stats.mean)
print("%.2f (%.2f, %.2f)" % (sim.value, sim.lower_bound, sim.upper_bound))
print("***Arg2****")
sim = bas.bootstrap(final_average_return2, stat_func=bs_stats.mean)