def test_weightstats_ddof_tests(self):
    """Check that ``ttest_mean`` and ``tconfint_mean`` do not depend on ``ddof``.

    Three ``DescrStatsW`` instances are built from the same data and weights
    with ``ddof`` 0, 1 and 2; the results for ``ddof`` 1 and 2 must agree
    with the ``ddof=0`` results to 14 decimals.
    """
    # one sample case
    data, weights = self.x1_2d, self.w1
    baseline = DescrStatsW(data, weights=weights, ddof=0)
    variants = [DescrStatsW(data, weights=weights, ddof=ddof)
                for ddof in (1, 2)]

    # t-test of the mean: first with the default null value, then with 0.5
    for ttest_args in [(), (0.5,)]:
        # np.r_ concatenates the returned tuple into one array
        expected = np.r_[baseline.ttest_mean(*ttest_args)]
        for stats in variants:
            assert_almost_equal(np.r_[stats.ttest_mean(*ttest_args)],
                                expected, 14)

    # confidence interval of the mean
    expected = baseline.tconfint_mean()
    for stats in variants:
        assert_almost_equal(stats.tconfint_mean(), expected, 14)
def test_weightstats_ddof_tests(self):
    """Verify t-test and t confidence interval are independent of user ``ddof``.

    The same data/weights are wrapped in ``DescrStatsW`` with ``ddof`` 0, 1
    and 2, and every result is compared with the ``ddof=0`` result at 14
    decimals.
    """
    # one sample case
    x1_2d = self.x1_2d
    w1 = self.w1
    ref, alt1, alt2 = (DescrStatsW(x1_2d, weights=w1, ddof=k)
                       for k in (0, 1, 2))

    def flat(result):
        # concatenate the result tuple into one array with np.r_
        return np.r_[result]

    def check_ttest(*args):
        # compare the t-test of both alternatives with the reference
        want = flat(ref.ttest_mean(*args))
        got1 = flat(alt1.ttest_mean(*args))
        got2 = flat(alt2.ttest_mean(*args))
        assert_almost_equal(got1, want, 14)
        assert_almost_equal(got2, want, 14)

    # check ttest independent of user ddof
    check_ttest()
    check_ttest(0.5)

    # check confint independent of user ddof
    want_ci = ref.tconfint_mean()
    got_ci1 = alt1.tconfint_mean()
    got_ci2 = alt2.tconfint_mean()
    assert_almost_equal(got_ci1, want_ci, 14)
    assert_almost_equal(got_ci2, want_ci, 14)
def summarize(name, test=False, space_creator=space_optimized):
    """Print a summary of the runs stored in ``output/<name>/`` and plot timings.

    For every ``*_results.txt`` file the matching ``*_log.txt`` is scanned for
    the text "out of memory"; such runs are counted as failed (memory error).
    Otherwise the fourth line of the results file is read as the run time and
    converted to minutes (``/ 1000 / 60``); a sixth line starting with
    "False" marks the run as failed (timed out).  All other runs are
    successes and their run time is collected.

    Parameters
    ----------
    name : str
        Sub-directory of ``output/`` to scan; also used as prefix of every
        printed line and in the figure file names.
    test : bool, optional
        When true, ``check`` is called on the log file of each successful run.
    space_creator : optional
        Forwarded to ``check`` as ``space_creator``.

    Returns
    -------
    list
        Run times (minutes) of the successful runs.  When no result file is
        found the list is empty and no statistics or figures are produced.
    """
    print("---------------------")
    print(f"{name}> summarizing")
    files = glob.glob(f"output/{name}/*_results.txt")
    print(f"{name}> {len(files)} files found")

    failed = 0
    timed_out = 0
    memory_error = 0
    results = []
    for file in files:
        log_file = file.replace("_results.txt", "_log.txt")
        with open(log_file) as f:
            log_text = f.read()
        if "out of memory" in log_text:
            failed += 1
            memory_error += 1
            print(f"{name}> {log_file} out of memory")
            continue

        with open(file) as f:
            lines = f.readlines()
        time_taken = int(lines[3]) / 1000 / 60  # minutes
        if lines[5].startswith("False"):
            print(f"{name}> {file} timed out")
            failed += 1
            timed_out += 1
            continue

        results.append(time_taken)
        if test:
            check(log_file, stop_on_error=True, space_creator=space_creator)

    success_count = len(results)
    total_runs = success_count + failed
    print(f"{name}> {success_count} successes")
    if total_runs == 0:
        # Nothing was found: the failure percentage and the binomial interval
        # below would divide by zero, and there is nothing to plot.
        print(f"{name}> no runs to summarize")
        return results

    if success_count != 0:
        d = DescrStatsW(results)
        print(f"{name}> mean={d.mean}, std_mean={d.std_mean}")
        print(f"{name}> confidence interval", d.tconfint_mean())
    print(f"{name}> {failed} failed, {failed / total_runs * 100}%")
    print(f"{name}> {timed_out} timed out")
    print(f"{name}> {memory_error} memory error")
    print(
        f"{name}> binomial success ci={proportion_confint(success_count, total_runs, method='wilson')}"
    )

    frame = frame_of(results, name)
    plt.figure()
    sns.boxplot(data=frame, y="time")
    plt.savefig(f"output/fig{name}.png")
    plt.figure()
    sns.displot(frame["time"])
    plt.ylim(0, 350)
    plt.savefig(f"output/fig{name}-dist.png")
    return results
def grouped_weights_statsdf(df, statscols, groupbycol, weightscol):
    """Generate weighted means and confidence bounds per group for several columns.

    For every column in ``statscols`` the rows with a missing value in that
    column are dropped, the remaining rows are grouped by ``groupbycol`` and
    a ``DescrStatsW`` (``ddof=0``) weighted by ``weightscol`` is computed for
    each group.  The bounds come from ``tconfint_mean()`` with its default
    settings.

    Parameters
    ----------
    df : DataFrame
        Data to be weighted.
    statscols : list
        Columns/outcomes for the weighted statistics.
    groupbycol : str
        Column name in ``df`` that defines the groups.
    weightscol : str
        Column name in ``df`` holding the weights.

    Returns
    -------
    DataFrame
        Indexed by (``outcome``, ``groups``) with the columns
        ``weighted mean``, ``lower bound``, ``upper bound``,
        ``wei_n__group`` (``nobs`` of the weighted statistics) and
        ``tot_n_unweigthed`` (number of rows with a value for the outcome).
        An empty DataFrame when ``statscols`` is empty.
    """
    stat_labels = ['weighted mean', 'lower bound', 'upper bound',
                   'wei_n__group']
    per_outcome = []
    for c in statscols:
        cdf = df.dropna(subset=[c])
        nrobs = len(cdf)
        grouped = cdf.groupby(groupbycol)
        groups = list(grouped.groups.keys())

        means, lower, upper, nrobs_gr = [], [], [], []
        for gr in groups:
            group_df = grouped.get_group(gr)
            stats = DescrStatsW(group_df[c],
                                weights=group_df[weightscol],
                                ddof=0)
            # compute the interval once instead of once per bound
            ci_low, ci_high = stats.tconfint_mean()
            means.append(stats.mean)
            lower.append(ci_low)
            upper.append(ci_high)
            nrobs_gr.append(stats.nobs)

        weightedstats = pd.DataFrame([means, lower, upper, nrobs_gr],
                                     columns=groups,
                                     index=stat_labels).T
        weightedstats['tot_n_unweigthed'] = nrobs
        weightedstats['outcome'] = c
        weightedstats.index.name = 'groups'
        colstats = weightedstats.reset_index()
        per_outcome.append(colstats.set_index(['outcome', 'groups']))

    if not per_outcome:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()
    # concatenate once rather than growing a frame inside the loop
    return pd.concat(per_outcome)
def one_t_test(pdf, data_measlevs, var_name, test_value=0):
    """One sample t-test

    arguments:
    pdf: data; the column var_name is tested after dropping missing values
    data_measlevs: measurement level of each variable, looked up by name;
        the test is run only for 'int' and 'unk'
    var_name (str): Name of the variable to test.
    test_value (numeric): Test against this value.

    return:
    ci_text (str): Confidence interval of the mean as text; empty string
        when the test is not computed.
    text_result (html str): Result in APA format.
    image (matplotlib): Bar chart with mean and confidence interval, or None
        when the test is not computed.
    """
    text_result = ''
    # Must be bound on every path: the final return uses it even when the
    # variable is not interval and no interval is computed.
    ci_text = ''
    data = pdf[var_name].dropna()
    if data_measlevs[var_name] in ['int', 'unk']:
        if data_measlevs[var_name] == 'unk':
            text_result += warn_unknown_variable
        if len(set(data)) == 1:
            # NOTE(review): this path returns two values while the normal
            # path returns three; kept as is, confirm against the callers.
            return _('One sample t-test cannot be run for constant variable.\n'
                     ), None

        descr = DescrStatsW(data)
        t, p, df = descr.ttest_mean(float(test_value))
        if LooseVersion(csc.versions['statsmodels']) >= LooseVersion('0.5'):
            # Or we could use confidence_interval_t
            cil, cih = descr.tconfint_mean()
            ci = (cih - cil) / 2  # half-width of the interval
            prec = cs_util.precision(data) + 1
            ci_text = '[%0.*f, %0.*f]' % (prec, cil, prec, cih)
        else:
            ci = 0  # only with statsmodels
            ci_text = _(
                'Sorry, newer statsmodels module is required for confidence interval.\n'
            )
        text_result += _('One sample t-test against %g') % float(
            test_value) + ': <i>t</i>(%d) = %0.3g, %s\n' % (df, t,
                                                          cs_util.print_p(p))

        # Graph
        image = cs_chart.create_variable_population_chart(data, var_name, ci)
    else:
        text_result += _(
            'One sample t-test is computed only for interval variables.')
        image = None
    return ci_text, text_result, image
def grouped_weights_statscol(df, statscol, groupbycol, weightscol):
    """Compute the weighted mean and confidence bounds of one column per group.

    Rows with a missing value in ``statscol`` are ignored; the caller's
    ``df`` is left untouched.  For every group of ``groupbycol`` a
    ``DescrStatsW`` (``ddof=0``) weighted by ``weightscol`` is built and its
    mean and ``tconfint_mean()`` bounds (default settings) are collected.

    Parameters
    ----------
    df : DataFrame
        Data to be weighted.
    statscol : str
        Column for which the weighted statistics are computed.
    groupbycol : str
        Column name in ``df`` that defines the groups.
    weightscol : str
        Column name in ``df`` holding the weights.

    Returns
    -------
    DataFrame
        One row per group with the columns ``weighted mean``,
        ``lower bound``, ``upper bound`` and ``numberofobs`` (number of rows
        with a value in ``statscol``, identical for every group).
    """
    # filter into a new frame instead of dropping rows from the argument
    valid = df.dropna(subset=[statscol])
    nrobs = len(valid)
    grouped = valid.groupby(groupbycol)
    groups = list(grouped.groups.keys())

    means, lower, upper = [], [], []
    for gr in groups:
        group_df = grouped.get_group(gr)
        stats = DescrStatsW(group_df[statscol],
                            weights=group_df[weightscol],
                            ddof=0)
        # compute the interval once instead of once per bound
        ci_low, ci_high = stats.tconfint_mean()
        means.append(stats.mean)
        lower.append(ci_low)
        upper.append(ci_high)

    weightedstats = pd.DataFrame(
        [means, lower, upper],
        columns=groups,
        index=['weighted mean', 'lower bound', 'upper bound']).T
    weightedstats['numberofobs'] = nrobs
    return weightedstats
def confidence_interval_t(data, ci_only=True):
    """95%, two-sided CI based on t-distribution
    http://statsmodels.sourceforge.net/stable/_modules/statsmodels/stats/weightstats.html#DescrStatsW.tconfint_mean

    arguments:
    data: values passed to DescrStatsW (pandas Series or DataFrame in the
        cases handled explicitly below)
    ci_only (bool): return only the half-width of the interval

    return:
    ci when ci_only is true (wrapped in a Series indexed by the column names
    when data is a DataFrame), otherwise the tuple (ci, cil, cih) where cil
    and cih are the lower and upper bounds and ci is half their distance.
    """
    # FIXME is this solution slow? Should we write our own CI function?
    if LooseVersion(csc.versions['statsmodels']) >= LooseVersion('0.5'):
        descr = DescrStatsW(data)
        cil, cih = descr.tconfint_mean()
        ci = (cih - cil) / 2
    else:
        # FIXME maybe this one is not correct
        cil = cih = ci = [None for i in data]

    if not ci_only:
        return ci, cil, cih
    if isinstance(data, pd.DataFrame):
        # without var names the call from comp_group_graph_cum fails
        return pd.Series(ci, index=data.columns)
    # Series and any other input: return the half-width as computed rather
    # than falling off the end of the function and returning None.
    return ci
my_knn_socres = cross_val_score(KNeighborsRegressor(n_neighbors=5), X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error') (-my_lm_scores.mean())**0.5 #> 15.697306009399101 # 線形回帰分析 (-my_knn_socres.mean())**0.5 #> 16.07308308943869 # K最近傍法 my_df = pd.DataFrame({'lm': -my_lm_scores, 'knn': -my_knn_socres}) my_df.head() #> lm knn #> 0 18.913720 108.16 #> 1 179.215044 0.64 #> 2 41.034336 64.00 #> 3 168.490212 184.96 #> 4 5.085308 0.00 my_df.boxplot().set_ylabel("$r^2$") from statsmodels.stats.weightstats import DescrStatsW d = DescrStatsW(my_df.lm - my_df.knn) d.ttest_mean()[1] # p値 #> 0.6952755720536115 d.tconfint_mean(alpha=0.05, alternative='two-sided') # 信頼区間 #> (-72.8275283312228, 48.95036023665703)
def __call__(self, alpha, df):
    """Return a ``Band`` holding the t confidence interval of the mean of ``df``.

    ``df`` is wrapped in ``DescrStatsW`` and ``alpha`` is passed to
    ``tconfint_mean``; the two returned bounds become the ``Band`` arguments.
    """
    bounds = DescrStatsW(df).tconfint_mean(alpha=alpha)
    lower, upper = bounds
    return Band(lower, upper)
# Fix the seed so the shuffle below is reproducible.
np.random.seed(75243)
# sample(frac=1) draws every element once, i.e. a random reordering.
temp = nota_media_dos_filmes_com_pelo_menos_10_votos.sample(frac=1)
# Running mean of the first i shuffled values (i = 1 .. len(temp) - 1).
medias = [temp[0:i].mean() for i in range(1, len(temp))]
plt.plot(medias)

# Confidence interval for the mean based on the normal distribution.
from statsmodels.stats.weightstats import zconfint
zconfint(nota_media_dos_filmes_com_pelo_menos_10_votos)

# Confidence interval for the mean based on the t distribution.
from statsmodels.stats.weightstats import DescrStatsW
descr_todos_com_10_votos = DescrStatsW(nota_media_dos_filmes_com_pelo_menos_10_votos)
descr_todos_com_10_votos.tconfint_mean()

"""# Vamos ver o filme 1..."""

filmes = pd.read_csv("movies.csv")
filmes.query("movieId==1")

# Ratings of the movie with movieId 1 (notas is defined outside this chunk).
notas1 = notas.query("movieId == 1")
notas1.head()

# Distribution of the ratings.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# confirm the seaborn version this script targets.
ax = sns.distplot(notas1.rating)
ax.set(xlabel="Nota", ylabel="Densidade")
ax.set_title("Distribuição das notas para o Toy Story")

# Box plot of the same ratings.
ax = sns.boxplot(notas1.rating)
ax.set(xlabel="Nota")
]
Y = [
    35.4, 34.6, 31.1, 32.4, 33.3, 34.7, 35.3, 34.3, 32.1, 28.3, 33.3, 30.5, 32.6, 33.3, 32.2
]
a = 0.05 # significance level (default) = 1 - confidence coefficient
alt = 'two-sided' # two-sided test (default)
# use 'smaller' for a left one-sided test
# use 'larger' for a right one-sided test
d = DescrStatsW(np.array(X) - np.array(Y)) # paired samples
d.ttest_mean(alternative=alt)[1] # p-value
#> 0.0006415571512322235
d.tconfint_mean(alpha=a, alternative=alt) # confidence interval
#> (-3.9955246743198867, -1.3644753256801117)
c = CompareMeans(DescrStatsW(X), DescrStatsW(Y)) # unpaired samples
ve = 'pooled' # assume equal variances (default); use 'unequal' otherwise.
c.ttest_ind(alternative=alt, usevar=ve)[1] # p-value
#> 0.000978530937238609
c.tconfint_diff(alpha=a, alternative=alt, usevar=ve) # confidence interval
#> (-4.170905570517185, -1.1890944294828283)
### 4.4.4 Test of independence (chi-squared test)
import pandas as pd
my_url = ('https://raw.githubusercontent.com/taroyabuki'
diagnostico_b = df.query("diagnosis == 'B'") # Efetuando o Zteste para a média (Comparando os resultados) ztest(diagnostico_m['mean_radius'], value = diagnostico_m['mean_radius'].mean()) ztest(diagnostico_m['mean_radius'], value = diagnostico_b['mean_radius'].mean()) # Gerando o intervalo de confiança zconfint(diagnostico_m['mean_radius']) zconfint(diagnostico_b['mean_radius']) """---------------------------------------------------------------------------- T Test """ diagnostico_m = df.query("diagnosis == 'M'") diagnostico_b = df.query("diagnosis == 'B'") # Aplicando o teste resultados_m = DescrStatsW(diagnostico_m['mean_radius']) resultados_b = DescrStatsW(diagnostico_b['mean_radius']) # Gerando o intervalo de confiança resultados_m.tconfint_mean() resultados_b.tconfint_mean()