def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     "Normal Distribution", "Z", "P(Z)", "")

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     "Exponential Distribution", "X", "P(X)", "")

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), "g")
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     "T-Distribution", "X", "P(X)", ["normal", "t=4", "t=10"])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     "F-Distribution", "F", "P(F)", ["(3,4) DOF", "(10,15) DOF"])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
                     "Weibull Distribution", "X", "P(X)", ["k=1", "k=2"],
                     xmin=0, xmax=4)

    # Uniform distribution
    showDistribution(x, stats.uniform, "",
                     "Uniform Distribution", "X", "P(X)", "")

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     "Logistic Distribution", "X", "P(X)", ["Normal", "Logistic"])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), "",
                     "Lognormal Distribution", "X", "lognorm(X)", "", xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title("Lognormal Distribution")
    xlabel("log(X)")
    ylabel("lognorm(X)")
    show()
def ftest(data1, data2, alpha=0.05):
    alpha1 = alpha
    Sd1 = np.var(data1, ddof=1)  # unbiased sample variances (ddof=1)
    Sd2 = np.var(data2, ddof=1)
    n1 = len(data1)
    n2 = len(data2)
    F = Sd1 / Sd2  # F statistic
    if (F > stats.f(n1 - 1, n2 - 1).ppf(1 - alpha / 2)
            or F < stats.f(n1 - 1, n2 - 1).ppf(alpha / 2)):
        print("Reject H0 at the significance level of", alpha1, ".")
    else:
        print("Fail to reject H0 at the significance level of", alpha1, ".")
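# A minimal usage sketch for the ftest above (assumes numpy as np and
# scipy.stats as stats are imported, as the function body does); the two
# samples are fabricated illustration data, not from the original source.
rng = np.random.default_rng(0)
sample1 = rng.normal(loc=0.0, scale=1.0, size=30)
sample2 = rng.normal(loc=0.0, scale=1.5, size=25)
ftest(sample1, sample2, alpha=0.05)  # prints the reject / fail-to-reject decision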
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g-.')
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)', ['k=1', 'k=2'],
                     xmin=0, xmax=4)

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
def show_continuous():
    """Show a variety of continuous distributions"""
    x = np.linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plt.plot(x, stats.norm.pdf(x), 'g-.')
    plt.hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(np.arange(0, 5, 0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)', ['k=1', 'k=2'],
                     xmin=0, xmax=4)

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = np.logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plt.plot(np.log(x), stats.lognorm.pdf(x, 2))
    plt.xlim(-10, 4)
    plt.title('Lognormal Distribution')
    plt.xlabel('log(X)')
    plt.ylabel('lognorm(X)')
    plt.show()
def twoway_anova(data, alpha):
    '''
    Two-way ANOVA: analysis of variance with two categorical independent
    variables; this version handles the no-replication case only.

    Parameters
    ----------
    data : R(m*n)
        Table of observations over the levels of the two factors,
        a standard m * n two-dimensional matrix.
    alpha:
        Significance level for the F-statistic hypothesis tests.

    Returns
    -------
    R: strength of the relationship between the independent variables and
       the dependent variable, R**2 = (SSR + SSC) / SST
    '''
    data = np.array(data)
    k, r = np.shape(data)  # k rows, r columns
    n = k * r  # total number of observations
    mrs = np.mean(data, axis=1)  # row means
    mcs = np.mean(data, axis=0)  # column means
    mt = np.mean(data)  # grand mean
    sst = (np.var(data) * k * r).round(4)  # total sum of squares
    ssr = (np.sum((mrs - mt)**2) * r).round(4)  # row sum of squares
    ssc = (np.sum((mcs - mt)**2) * k).round(4)  # column sum of squares
    sse = (sst - ssr - ssc).round(4)  # random error
    msr = (ssr / (k - 1)).round(4)
    msc = (ssc / (r - 1)).round(4)
    mse = (sse / ((k - 1) * (r - 1))).round(4)

    rvr = st.f(k - 1, (k - 1) * (r - 1))
    fr = (msr / mse).round(4)  # test of the row factor
    fr_crit = rvr.ppf(1 - alpha).round(4)  # F critical value at this level
    pr = (1 - rvr.cdf(fr)).round(4)  # P value

    rvc = st.f(r - 1, (k - 1) * (r - 1))
    fc = (msc / mse).round(4)  # test of the column factor
    fc_crit = rvc.ppf(1 - alpha).round(4)  # F critical value at this level
    pc = (1 - rvc.cdf(fc)).round(4)  # P value

    print('{0:-^97}'.format(''))
    print('{0:^10}|{1:^15}|{2:^10}|{3:^15}|{4:^15}|{5:^15}|{6:^15}'.format(
        'Source', 'SS', 'df', 'MS', 'F', 'P-Value', 'F crit'))
    print('{0:-^97}'.format(''))
    print('{0:^10}|{1:^15}|{2:^10}|{3:^15}|{4:^15}|{5:^15}|{6:^15}'.format(
        'Rows', ssr, k - 1, msr, fr, pr, fr_crit))
    print('{0:^10}|{1:^15}|{2:^10}|{3:^15}|{4:^15}|{5:^15}|{6:^15}'.format(
        'Cols', ssc, r - 1, msc, fc, pc, fc_crit))
    print('{0:^10}|{1:^15}|{2:^10}|{3:^15}|'.format(
        'Errors', sse, (k - 1) * (r - 1), mse))
    print('{0:^10}|{1:^15}|{2:^10}|'.format('Total', sst, k * r - 1))
    return (ssr + ssc) / sst
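# Illustrative call of twoway_anova above on a fabricated 3 x 4 table of
# observations (rows = levels of one factor, columns = levels of the other);
# the return value is the relationship strength (SSR + SSC) / SST.
observations = [[365, 350, 343, 340],
                [345, 368, 363, 330],
                [358, 323, 353, 343]]
strength = twoway_anova(observations, alpha=0.05)
print('R^2 =', round(strength, 4))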
def blocked_anova(file, reps):
    data, _, treat_sums, block_sums, grand_total, n, n_treats, obs_per_treat = \
        prep_data(file, reps)
    n_blocks = obs_per_treat // reps
    obs_per_block = n // n_blocks
    anova = {}
    sources = ['treats', 'blocks', 'err', 'total']
    for item in sources:
        anova[item] = {}

    # DF's
    anova['total']['DF'] = n - 1
    anova['treats']['DF'] = n_treats - 1
    anova['blocks']['DF'] = n_blocks - 1
    anova['err']['DF'] = anova['total']['DF'] - anova['treats']['DF'] - \
        anova['blocks']['DF']

    # SS's
    anova['total']['SS'] = sum_xsq(data) - ((sum_x(data)**2) / n)
    ssTreats = 0
    for i in range(n_treats):
        ssTreats += (treat_sums[i]**2) / obs_per_treat
    ssTreats -= (grand_total**2) / n
    anova['treats']['SS'] = ssTreats
    ssBlocks = 0
    for i in range(n_blocks):
        ssBlocks += (block_sums[i]**2) / obs_per_block
    ssBlocks -= (grand_total**2) / n
    anova['blocks']['SS'] = ssBlocks
    anova['err']['SS'] = anova['total']['SS'] - anova['treats']['SS'] - \
        anova['blocks']['SS']

    # MS's
    anova['treats']['MS'] = anova['treats']['SS'] / anova['treats']['DF']
    anova['blocks']['MS'] = anova['blocks']['SS'] / anova['blocks']['DF']
    anova['err']['MS'] = anova['err']['SS'] / anova['err']['DF']

    # F
    anova['treats']['F'] = anova['treats']['MS'] / anova['err']['MS']
    anova['blocks']['F'] = anova['blocks']['MS'] / anova['err']['MS']

    # p
    anova['treats']['p'] = stats.f(anova['treats']['DF'],
                                   anova['err']['DF']).sf(anova['treats']['F'])
    anova['blocks']['p'] = stats.f(anova['blocks']['DF'],
                                   anova['err']['DF']).sf(anova['blocks']['F'])

    pretty_anova_tbl(anova, sources)
def F_test(s1, s2, n1, n2, H0, alpha=0.05):
    '''
    F-Test for comparison of two variances. H0: σ1 ≤, ≥, = σ2. In slides 458.
    REQUIRE: H0 can take three values: "equal", "less", "greater".
    RETURN: Test statistics, critical value, p-value.
    '''
    F = s1**2 / s2**2
    if H0 == "less":
        c_value = stats.f(n1 - 1, n2 - 1).ppf(1 - alpha)
        p_value = 1 - stats.f(n1 - 1, n2 - 1).cdf(F)
    elif H0 == "greater":
        c_value = stats.f(n1 - 1, n2 - 1).ppf(alpha)
        p_value = stats.f(n1 - 1, n2 - 1).cdf(F)
    elif H0 == "equal":
        F1, F2 = F, 1 / F
        c_value1 = stats.f(n1 - 1, n2 - 1).ppf(1 - alpha / 2)
        c_value2 = stats.f(n2 - 1, n1 - 1).ppf(1 - alpha / 2)
        F = (F1, F2)
        c_value = (c_value1, c_value2)
        p_value = 2 * min(1 - stats.f(n1 - 1, n2 - 1).cdf(F1),
                          1 - stats.f(n2 - 1, n1 - 1).cdf(F2))
    return F, c_value, p_value
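# Sketch of a two-sided call to F_test above, with invented sample standard
# deviations and sample sizes; for H0 == "equal" the function returns paired
# statistics and critical values.
F, c_value, p_value = F_test(s1=2.1, s2=1.6, n1=16, n2=21, H0="equal", alpha=0.05)
print("test statistics:", F)
print("critical values:", c_value)
print("p-value:", p_value)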
def FProbabilitiesLowerTail(values, dfn, dfd):
    if len(values) > 0 and dfn > 0 and dfd > 0:
        outputStr = ""
        areas = []
        for val in values:
            outputStr += str(val)
            rv = stats.f(dfn, dfd, loc=0, scale=1)
            area = rv.cdf(val)
            area = "{0:.5f}".format(area)
            areas.append(area)
            if len(values) > 1 and values.index(val) < len(values) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", numerator degrees of freedom: " + str(dfn) + \
            ", denominator degrees of freedom: " + str(dfd)
        return outputStr, areas
    elif dfn <= 0 or dfd <= 0:
        return False, "Degrees of freedom must be greater than 0."
    else:
        return False, "Valid values must be entered for the calculation."
def FQuantilesLowerTail(probs, dfn, dfd):
    if len(probs) > 0 and dfn > 0 and dfd > 0:
        outputStr = ""
        yArray = []
        for prob in probs:
            outputStr += str(prob)
            if prob > 0 and prob < 1:
                rv = stats.f(dfn, dfd, loc=0, scale=1)
                y = rv.ppf(prob)
                y = "{0:.5f}".format(y)
                yArray.append(y)
            else:
                yArray.append("NaN")
            if len(probs) > 1 and probs.index(prob) < len(probs) - 1:
                outputStr += ", "
            else:
                outputStr += ""
        outputStr += ", numerator degrees of freedom: " + str(dfn) + \
            ", denominator degrees of freedom: " + str(dfd)
        return outputStr, yArray
    elif dfn <= 0 or dfd <= 0:
        return False, "Degrees of freedom must be greater than 0."
    else:
        return False, "A valid probability value must be entered."
def sediff(sv1, sv2, n1, n2, alpha, bilateral=True):
    """
    Calculate the interval estimation of the ratio of two population
    variances, based on

        (sv1**2 / sv2**2) * (pv2**2 / pv1**2) ~ F(n1-1, n2-1)

    Parameters
    ----------
    sv1 : sample standard deviation of population 1
    sv2 : sample standard deviation of population 2
    n1 : sample size for population 1
    n2 : sample size for population 2
    alpha: confidence level
    bilateral: whether the estimate is two-sided (currently unused)

    Returns
    -------
    tuple(pm1, pm2) : interval estimate of the ratio of the two
        population variances
    """
    rv = st.f(n1 - 1, n2 - 1)
    f1 = rv.ppf((1 - alpha) / 2)
    f2 = rv.ppf((1 + alpha) / 2)
    svdiff = sv1**2 / sv2**2
    return tuple([svdiff / f2, svdiff / f1])
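# Example with made-up numbers: a 95% interval for the ratio of two population
# variances, from sample standard deviations 4.0 and 3.2 with n1 = 25, n2 = 20.
# Note that alpha here is the confidence level, as the ppf calls above use it.
low, high = sediff(sv1=4.0, sv2=3.2, n1=25, n2=20, alpha=0.95)
print('variance-ratio CI: ({0:.4f}, {1:.4f})'.format(low, high))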
def calc_det_thresh(fstat_vals, det_p_val, TB_prod, channel_cnt,
                    fstat_ref_peak=None):
    fstat_min = np.min(fstat_vals)
    fstat_max = np.max(fstat_vals)

    # compute reference threshold if not provided
    if fstat_ref_peak is not None:
        fstat_peak = fstat_ref_peak
    else:
        def temp_fstat(f):
            return -stats.f(TB_prod, TB_prod * (channel_cnt - 1)).pdf(f)

        fstat_peak = minimize_scalar(temp_fstat,
                                     bracket=(fstat_min, fstat_max)).x

    # locate the peak of the empirical F-statistic density
    kde = stats.gaussian_kde(fstat_vals)

    def temp_kde(f):
        return -kde.pdf(f)[0]

    kde_peak = minimize_scalar(temp_kde, bracket=(fstat_min, fstat_max),
                               options={'maxiter': 250}).x

    return stats.f(TB_prod, TB_prod * (channel_cnt - 1)).ppf(det_p_val) * \
        (kde_peak / fstat_peak)
def plotFDistribution(FCrit, FValue, dfModel, dfError):
    mu = 0
    x = np.linspace(0, FValue + 2, 1001)[1:]
    fig, ax = plt.subplots(figsize=(5, 3.75))
    dist = stats.f(dfModel, dfError, mu)
    plt.plot(x, dist.pdf(x), ls='-', c='black',
             label=r'$d_1=%i,\ d_2=%i$' % (dfModel, dfError))
    plt.xlim(0, FValue + 2)
    plt.ylim(0.0, 1.0)
    plt.annotate('F Crit\n (%s)' % FCrit, xy=(FCrit, 0),
                 xytext=(FCrit - 1, 0.4),
                 arrowprops=dict(facecolor='red', shrink=0.05))
    plt.annotate('F Value\n (%s)' % FValue, xy=(FValue, 0),
                 xytext=(FValue - 2, 0.2),
                 arrowprops=dict(facecolor='blue', shrink=0.05))
    plt.xlabel('$x$')
    plt.ylabel(r'$p(x|d_1, d_2)$')
    plt.title("Fisher's Distribution")
    plt.legend()
    plt.show()
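# Hypothetical call of plotFDistribution above (assumes numpy, matplotlib and
# scipy.stats imported as in the function body): an ANOVA with 2 model and 27
# error degrees of freedom, the 5% critical value, and an invented observed F.
FCrit = round(stats.f(2, 27).ppf(0.95), 2)
plotFDistribution(FCrit=FCrit, FValue=5.1, dfModel=2, dfError=27)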
def PlotFDistributionDistributionFunction(dfn, dfd):
    if dfn > 0 and dfd > 0:
        main_frame = QtGui.QWidget()
        dpi = 100
        fig = Figure((5.0, 4.0), dpi=dpi)
        canvas = FigureCanvas(fig)
        canvas.setParent(main_frame)
        axes = fig.add_subplot(111)
        mpl_toolbar = NavigationToolbar(canvas, main_frame)
        hbox = QtGui.QHBoxLayout()
        vbox = QtGui.QVBoxLayout()
        vbox.addWidget(canvas)
        vbox.addWidget(mpl_toolbar)
        vbox.addLayout(hbox)
        main_frame.setLayout(vbox)
        alpha = 0.0005
        sequence = stats.f.isf(alpha, dfn, dfd)
        x = np.linspace(-sequence, sequence, 1000)
        rv = stats.f(dfn, dfd)
        y = rv.cdf(x)
        axes.plot(x, y)
        canvas.draw()
        return main_frame
    else:
        return False, "Degrees of freedom must be greater than 0."
#---/F DISTRIBUTION
def param_table(results, title, pad_bottom=False):
    """Formatted standard parameter table"""
    param_data = np.c_[results.params.values[:, None],
                       results.std_errors.values[:, None],
                       results.tstats.values[:, None],
                       results.pvalues.values[:, None],
                       results.conf_int()]
    data = []
    for row in param_data:
        txt_row = []
        for i, v in enumerate(row):
            f = _str
            if i == 3:
                f = pval_format
            txt_row.append(f(v))
        data.append(txt_row)
    header = ['Parameter', 'Std. Err.', 'T-stat', 'P-value',
              'Lower CI', 'Upper CI']
    table_stubs = list(results.params.index)
    if pad_bottom:
        # Append blank row for spacing
        data.append([''] * 6)
        table_stubs += ['']

    return SimpleTable(data, stubs=table_stubs, txt_fmt=fmt_params,
                       headers=header, title=title)
def three_sampling_dis():
    """
    The three major sampling distributions and the standard normal distribution
    :return:
    """
    nor_dis = stats.norm()
    chi2_dis = stats.chi2(df=app.df1)
    t_dis = stats.t(df=app.df2)
    f_dis = stats.f(dfn=app.df3, dfd=app.df4)

    x1 = np.linspace(nor_dis.ppf(0.001), nor_dis.ppf(0.999), 1000)
    x2 = np.linspace(chi2_dis.ppf(0.001), chi2_dis.ppf(0.999), 1000)
    x3 = np.linspace(t_dis.ppf(0.001), t_dis.ppf(0.999), 1000)
    x4 = np.linspace(f_dis.ppf(0.001), f_dis.ppf(0.999), 1000)

    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    ax.plot(x1, nor_dis.pdf(x1), 'r-', lw=2, label=r'N(0, 1)')
    ax.plot(x2, chi2_dis.pdf(x2), 'g-', lw=2, label=rf'$\chi^2$({app.df1})')
    ax.plot(x3, t_dis.pdf(x3), 'b-', lw=2, label=f't({app.df2})')
    ax.plot(x4, f_dis.pdf(x4), 'm-', lw=2, label=f'F({app.df3}, {app.df4})')
    plt.ylabel('Probability')
    plt.title(r'PDF of Three Sampling Distributions')
    ax.legend(loc='best', frameon=False)
    plt.show()
def calculateSampleData(self, data):
    self.l_mean = data.groupby(level=0).mean().T.mean()
    self.l_ss = ((self.l_mean - self.total_mean) ** 2).sum() * self.r * self.t
    self.l_ms = self.l_ss / (self.s - 1)
    self.l_f = self.l_ms / self.e_ms
    self.l_f_distribute = f(self.s - 1, self.r * self.s * (self.t - 1))
    self.l_p = self.l_f_distribute.sf(self.l_f)
def __init__(self, d1, d2):
    self.d1 = d1
    self.d2 = d2
    # set dist before calling super's __init__
    self.dist = st.f(d1, d2)
    super(F, self).__init__()
def anova_byHand():
    """
    Calculate the ANOVA by hand. While you would normally not do that, this
    function shows how the underlying values can be calculated.
    """
    # Get the data
    data = getData("altman_910.txt", subDir=r"..\Data\data_altman")

    # Convert them to pandas format and group them by their group value
    df = pd.DataFrame(data, columns=["values", "group"])
    groups = df.groupby("group")

    # The "total sum-square" is the squared deviation from the mean
    ss_total = np.sum((df["values"] - df["values"].mean()) ** 2)

    # Calculate ss_treatment and ss_error
    (ss_treatments, ss_error) = (0, 0)
    for val, group in groups:
        ss_error += sum((group["values"] - group["values"].mean()) ** 2)
        ss_treatments += len(group) * (group["values"].mean() - df["values"].mean()) ** 2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)
    f_dist = stats.f(df_groups, df_residuals)  # avoid shadowing the DataFrame df
    p = f_dist.sf(F)

    print(("ANOVA-Results: F = {0}, and p<{1}".format(F, p)))
    return (F, p)
def anova_byHand(): """Calculate the ANOVA by hand""" # Get the data data = getData('altman_910.txt', subDir='..\Data\data_altman') # Convert them to pandas-forman and group them by their group value df = pd.DataFrame(data, columns=['values', 'group']) groups = df.groupby('group') # The "total sum-square" is the squared deviation from the mean ss_total = np.sum((df['values'] - df['values'].mean())**2) # Calculate ss_treatment and ss_error (ss_treatments, ss_error) = (0, 0) for val, group in groups: ss_error += sum((group['values'] - group['values'].mean())**2) ss_treatments += len(group) * ( group['values'].mean() - df['values'].mean())**2 df_groups = len(groups) - 1 df_residuals = len(data) - len(groups) F = (ss_treatments / df_groups) / (ss_error / df_residuals) df = stats.f(df_groups, df_residuals) p = df.sf(F) print('ANOVA-Results: F = {0}, and p<{1}'.format(F, p)) return (F, p)
def tt(A, B):
    # `f` is assumed to be a two-sample variance-equality test and `ttest` an
    # alias for scipy.stats.ttest_ind, both returning results with a .pvalue
    # attribute; if the variances differ, fall back to Welch's t-test.
    f_p = f(A, B).pvalue
    if f_p <= 0.05:
        t_p = ttest(A, B, equal_var=False).pvalue
    else:
        t_p = ttest(A, B, equal_var=True).pvalue
    return t_p
def __init__(self, groups):
    k = groups.num_groups()
    n = groups.n()
    dist = stats.f(k - 1, n - k)
    super(LinearContrastHyp, self).__init__(dist=dist,
                                            kind=AltHypKind.TWO_SIDED)
    self._groups = groups
def statistic(self, alpha=0.05):
    x = self.train_x[:, 1]
    y = self.train_y
    y_pred = self.predict(self.train_x, add_const=False)
    k = self.num_features
    n = self.num_samples

    SSE = np.sum((y - y_pred)**2)
    SST = np.sum((y - y.mean()) * y)
    SSR = np.sum((y_pred - y_pred.mean())**2)
    sigma_e = SSE / (n - k - 1)
    F_test = SSR * (n - k - 1) / SSE / k
    F_q = stats.f(k, n - k - 1).ppf(1 - alpha)
    test_result = 'NO SIGNIFICANT LINEAR DEPENDENCE!' if F_test < F_q \
        else 'SIGNIFICANT LINEAR DEPENDENCE!'

    print_list = [('k', k), ('n', n), ('SSE', SSE), ('SSR', SSR),
                  ('SST', SST), ('sigma_e', sigma_e)]
    if k == 1:
        Lxx = np.sum((x - x.mean()) * x)
        Lyy = SST
        Lxy = np.sum((y - y.mean()) * x)
        print_list.extend([('Lxx', Lxx), ('Lxy', Lxy), ('Lyy', Lyy)])
    print_list.extend([('F_test', F_test), ('F_q', F_q),
                       ('test_result', test_result)])

    print('=' * 30, 'statistics', '=' * 30)
    utils.pair_print(print_list)
    print('=' * (len('statistics') + 62), end='\n\n')
def test_Normal_to_F(self):
    A, B, C, V, W, X, Y, Z = RV(Normal(mean=0, var=1)**8)
    sims = ((((A**2) + (B**2) + (C**2)) / 3) /
            (((V**2) + (W**2) + (X**2) + (Y**2) + (Z**2)) / 5)).sim(Nsim)
    cdf = stats.f(dfn=3, dfd=5).cdf
    pval = stats.kstest(sims, cdf).pvalue
    self.assertTrue(pval > .01)
def fix_alpha(alpha, Sigma, Sigma_star):
    p = Sigma.shape[0]
    Sigma_12 = fractional_matrix_power(Sigma, 0.5)
    matrix = Sigma_12.T @ np.linalg.inv(Sigma_star) @ Sigma_12
    lambdas = np.real(np.linalg.eigvals(matrix))
    factorials = [1, 1, 2, 8]
    k = np.asarray([factorials[r] * np.sum(lambdas**r) for r in [1, 1, 2, 3]])
    t1 = 4 * k[1] * k[2]**2 + k[3] * (k[2] - k[1]**2)
    t2 = k[3] * k[1] - 2 * k[2]**2
    chi_quantile = sps.chi2(p).ppf(1 - alpha)

    if t1 < 10**(-5):
        a_new = 2 + (k[1]**2) / (k[2]**2)
        b_new = (k[1]**3) / k[2] + k[1]
        alpha_star = 1 - sps.invgamma(a_new, scale=b_new).cdf(chi_quantile)
    elif t2 < 10**(-5):
        a_new = (k[1]**2) / k[2]
        b_new = k[2] / k[1]
        alpha_star = 1 - sps.gamma(a_new, scale=b_new).cdf(chi_quantile)
    else:
        a1 = 2 * k[1] * (k[3] * k[1] + k[2] * k[1]**2 - k[2]**2) / t1
        a2 = 3 + 2 * k[2] * (k[2] + k[1]**2) / t2
        alpha_star = 1 - sps.f(2 * a1, 2 * a2).cdf(a2 * t2 * chi_quantile / (a1 * t1))
    return alpha_star
def anova_byHand():
    """
    Calculate the ANOVA by hand. While you would normally not do that, this
    function shows how the underlying values can be calculated.
    """
    # Get the data
    inFile = 'altman_910.txt'
    data = np.genfromtxt(inFile, delimiter=',')

    # Convert them to pandas format and group them by their group value
    df = pd.DataFrame(data, columns=['values', 'group'])
    groups = df.groupby('group')

    # The "total sum-square" is the squared deviation from the mean
    ss_total = np.sum((df['values'] - df['values'].mean())**2)

    # Calculate ss_treatment and ss_error
    (ss_treatments, ss_error) = (0, 0)
    for val, group in groups:
        ss_error += sum((group['values'] - group['values'].mean())**2)
        ss_treatments += len(group) * (group['values'].mean() - df['values'].mean())**2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)
    f_dist = stats.f(df_groups, df_residuals)  # avoid shadowing the DataFrame df
    p = f_dist.sf(F)

    print(('ANOVA-Results: F = {0}, and p<{1}'.format(F, p)))
    return (F, p)
def LoF_test(N, X, Y, alpha=0.05):
    '''
    Test for Lack of Fit. H0: the linear regression model is appropriate.
    In slides 608.
    REQUIRE: multiple sampling for single x. N, X are 1-D lists. Y is a 2-D list.
    RETURN: (SSE, SSE_pe, SSE_if), (Test statistics and critical value).
    '''
    k = len(N)
    n = sum(N)
    mean_Y = [sum(Y[i]) / N[i] for i in range(k)]
    SSE_pe = sum(sum([(Y[i][j] - mean_Y[i])**2 for j in range(N[i])])
                 for i in range(k))
    x = []
    for i in range(k):
        x.extend([X[i]] * N[i])
    y = []
    for i in range(k):
        y.extend(Y[i])
    model = SLR(n, x, y)
    SSE = model.SSE
    SSE_if = SSE - SSE_pe
    F = (SSE_if / (k - 2)) / (SSE_pe / (n - k))
    f = stats.f(k - 2, n - k).ppf(1 - alpha)
    return (SSE, SSE_pe, SSE_if), (F, f)
def fun7():
    print("Plotting the three major sampling distributions")
    # plot the normal, chi-square, t and F distributions
    nor_dis = stats.norm()
    chi2_dis = stats.chi2(df=eval(k_1.get()))
    t_dis = stats.t(df=eval(t_1.get()))
    f_dis = stats.f(dfn=eval(f_1.get()), dfd=eval(f_2.get()))

    x1 = np.linspace(nor_dis.ppf(0.001), nor_dis.ppf(0.999), 1000)
    x2 = np.linspace(chi2_dis.ppf(0.001), chi2_dis.ppf(0.999), 1000)
    x3 = np.linspace(t_dis.ppf(0.001), t_dis.ppf(0.999), 1000)
    x4 = np.linspace(f_dis.ppf(0.001), f_dis.ppf(0.999), 1000)

    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    ax.plot(x1, nor_dis.pdf(x1), 'r-', lw=2, label=r'N(0, 1)')
    ax.plot(x2, chi2_dis.pdf(x2), 'g-', lw=2,
            label=r'$\chi^2$(%d)' % eval(k_1.get()))
    ax.plot(x3, t_dis.pdf(x3), 'b-', lw=2, label='t(%d)' % eval(t_1.get()))
    ax.plot(x4, f_dis.pdf(x4), 'm-', lw=2,
            label='F(%d, %d)' % (eval(f_1.get()), eval(f_2.get())))
    plt.xlabel("x")
    plt.ylabel('Probability')
    plt.title(r'PDF of Three Sampling Distributions')
    ax.legend(loc='best', frameon=False)
    plt.grid()
    plt.show()
def f_threshold_twoway_rm(n_subjects, factor_levels, effects='A*B',
                          pvalue=0.05):
    """ Compute F-value thresholds for a two-way ANOVA

    Parameters
    ----------
    n_subjects : int
        The number of subjects to be analyzed.
    factor_levels : list-like
        The number of levels per factor.
    effects : str
        A string denoting the effect to be returned. The following
        mapping is currently supported:
            'A': main effect of A
            'B': main effect of B
            'A:B': interaction effect
            'A+B': both main effects
            'A*B': all three effects
    pvalue : float
        The p-value to be thresholded.

    Returns
    -------
    f_threshold : list | float
        list of F-values for each effect if the number of effects
        requested > 2, else float.
    """
    effect_picks = _check_effects(effects)

    f_threshold = []
    for _, df1, df2 in _iter_contrasts(n_subjects, factor_levels,
                                       effect_picks):
        f_threshold.append(stats.f(df1, df2).isf(pvalue))

    return f_threshold if len(f_threshold) > 1 else f_threshold[0]
def anova_byHand():
    """
    Calculate the ANOVA by hand. While you would normally not do that, this
    function shows how the underlying values can be calculated.
    """
    # Get the data
    data = getData('altman_910.txt', subDir='.')

    # Convert them to pandas format and group them by their group value
    df = pd.DataFrame(data, columns=['values', 'group'])
    groups = df.groupby('group')

    # The "total sum-square" is the squared deviation from the mean
    ss_total = np.sum((df['values'] - df['values'].mean())**2)

    # Calculate ss_treatment and ss_error
    (ss_treatments, ss_error) = (0, 0)
    for val, group in groups:
        ss_error += sum((group['values'] - group['values'].mean())**2)
        ss_treatments += len(group) * (group['values'].mean() - df['values'].mean())**2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)
    f_dist = stats.f(df_groups, df_residuals)  # avoid shadowing the DataFrame df
    p = f_dist.sf(F)

    print(('ANOVA-Results: F = {0}, and p<{1}'.format(F, p)))
    return (F, p)
def calculateColumnData(self, data):
    self.c_mean = data.mean()
    self.c_ss = ((self.c_mean - self.total_mean) ** 2).sum() * self.s * self.t
    self.c_ms = self.c_ss / (self.r - 1)
    self.c_f = self.c_ms / self.e_ms
    self.c_f_distribute = f(self.r - 1, self.r * self.s * (self.t - 1))
    self.c_p = self.c_f_distribute.sf(self.c_f)
def p_value(t2):
    '''
    Calculate the p-value of the F distribution at t2
    '''
    # Hotelling's T^2 scaled by (n - p) / (p * (n - 1)) follows an F(p, n - p)
    # distribution; sample_size (n) and dimension (p) are module-level globals.
    T2 = (sample_size - dimension) / (dimension * (sample_size - 1)) * t2
    f = stats.f(dimension, sample_size - dimension)
    return f.sf(T2)  # upper-tail probability, the usual p-value
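# Sketch of using p_value above: bind the module-level globals it relies on
# (here n = 50 observations in p = 3 dimensions, both invented) and evaluate
# a Hotelling T^2 statistic on the F scale. Assumes scipy.stats as stats.
sample_size, dimension = 50, 3
print(p_value(t2=9.2))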
def generate_matrix():
    def f(X1, X2, X3):
        from random import randrange
        y = 8.4 + 8.5 * X1 + 5.7 * X2 + 9.7 * X3 + 8.9 * X1 * X1 + 0.2 * X2 * X2 + \
            0.5 * X3 * X3 + 2.0 * X1 * X2 + 0.7 * X1 * X3 + 4.3 * X2 * X3 + \
            9.7 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for i in range(m)] for j in range(N)]
    return matrix_with_y
def calculateCorrData(self, data):
    l_mean = pd.concat([self.l_mean] * self.r, axis=1,
                       keys=self.corr_mean.columns)
    c_mean = pd.concat([self.c_mean] * self.s, axis=1,
                       keys=self.corr_mean.index).T
    self.corr_ss = ((self.corr_mean - l_mean - c_mean + self.total_mean) ** 2).sum().sum() * self.t
    self.corr_ms = self.corr_ss / ((self.s - 1) * (self.r - 1))
    self.corr_f = self.corr_ms / self.e_ms
    self.corr_f_distribute = f((self.s - 1) * (self.r - 1),
                               self.r * self.s * (self.t - 1))
    self.corr_p = self.corr_f_distribute.sf(self.corr_f)
def Chow_Dickey(pd_series, split, reg_type='c', auto_lag='AIC', max_lag=4,
                verbose=False):
    if isinstance(pd_series, pd.DataFrame):
        # collapse a single-column DataFrame to a Series
        pd_series = pd_series.iloc[:, 0]
    if isinstance(split, pd.Timestamp) | isinstance(split, str):
        mask = pd_series.index <= pd.to_datetime(split)
        split1 = pd_series[mask]
        split2 = pd_series[~mask]
    else:
        mask = pd_series.index.year <= split
        split1 = pd_series[mask]
        split2 = pd_series[~mask]

    nr_adf = adfuller(pd_series, regression=reg_type, autolag=auto_lag,
                      maxlag=max_lag, regresults=True)[3]
    nr_lag = nr_adf.usedlag
    nr_model = nr_adf.resols
    nr_ssr = nr_model.ssr * nr_model.nobs
    param_length = nr_model.df_model + 1

    adf1 = adfuller(split1, regression=reg_type, autolag=None,
                    maxlag=max_lag, regresults=True)[3]
    adf1_model = adf1.resols
    N1 = adf1_model.nobs
    adf1_ssr = adf1_model.ssr * N1

    adf2 = adfuller(split2, regression=reg_type, autolag=None,
                    maxlag=max_lag, regresults=True)[3]
    adf2_model = adf2.resols
    N2 = adf2_model.nobs
    adf2_ssr = adf2_model.ssr * N2

    numerator = (nr_ssr - (adf1_ssr + adf2_ssr)) / param_length
    denominator = (adf1_ssr + adf2_ssr) / (N1 + N2 - 2 * param_length)
    F_stat = numerator / denominator
    f_dist = stat.f(param_length, N1 + N2 - 2 * param_length)
    p_val = 1 - f_dist.cdf(F_stat)
    return F_stat, p_val, nr_lag
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g')
    hold(True)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Uniform distribution
    showDistribution(x, stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
def generate_matrix():
    def f(X1, X2, X3):
        from random import randrange
        y = 7.7 + 2.8 * X1 + 0.5 * X2 + 2.6 * X3 + 1.4 * X1 * X1 + 0.3 * X2 * X2 + \
            7.1 * X3 * X3 + 5.0 * X1 * X2 + 0.3 * X1 * X3 + 9.3 * X2 * X3 + \
            4.1 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for i in range(m)] for j in range(N)]
    return matrix_with_y
def generate_matrix():
    def f(X1, X2, X3):
        from random import randrange
        y = 6.7 + 9.1 * X1 + 1.6 * X2 + 9.1 * X3 + 3.3 * X1 * X1 + 0.2 * X2 * X2 + \
            6.1 * X3 * X3 + 8.5 * X1 * X2 + 0.7 * X1 * X3 + 6.6 * X2 * X3 + \
            8.1 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for i in range(m)] for j in range(N)]
    return matrix_with_y
def generate_matrix():
    def f(X1, X2, X3):
        """Generate the function for this variant"""
        from random import randrange  # noise term (unless imported at module level)
        y = (5.6 + 8.0 * X1 + 4.8 * X2 + 6.2 * X3 + 5.9 * X1 * X1 + 1.0 * X2 * X2 +
             8.7 * X3 * X3 + 2.0 * X1 * X2 + 0.8 * X1 * X3 + 1.0 * X2 * X3 +
             3.0 * X1 * X2 * X3 + randrange(0, 10) - 5)
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for i in range(m)] for j in range(N)]
    return matrix_with_y
def generate_matrix():
    def f(X1, X2, X3):
        from random import randrange
        y = 3.5 + 6.6 * X1 + 5.3 * X2 + 5.0 * X3 + 5.1 * X1 * X1 + 0.1 * X2 * X2 + \
            7.2 * X3 * X3 + 1.4 * X1 * X2 + 0.7 * X1 * X3 + 4.2 * X2 * X3 + \
            7.7 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for _ in range(m)] for j in range(N)]
    return matrix_with_y
def anova(arr):
    # seasonal_matrix and m are assumed to be defined at module level
    a = seasonal_matrix(arr)
    tau = a.shape[0]
    zbar = a.mean(0)
    v_zbar = zbar.var()
    v = a.ravel().var()
    stat = m * (tau - 1) / (m - 1) * v_zbar / (v - v_zbar)
    i_a = stat > stats.f(m - 1, m * (tau - 1)).ppf(.9)
    return i_a
def generate_matrix():
    def f(X1, X2, X3):
        # my function
        from random import randrange  # noise term (unless imported at module level)
        y = 5.4 + 3.6 * X1 + 6.6 * X2 + 7.7 * X3 + 8.0 * X1 * X1 + 0.3 * X2 * X2 + \
            2.5 * X3 * X3 + 5.9 * X1 * X2 + 0.3 * X1 * X3 + 7.2 * X2 * X3 + \
            5.3 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_with_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                      for i in range(m)] for j in range(N)]
    return matrix_with_y
def __init__(self, data):
    self.r = data.shape[1]
    self.k = data.shape[0]
    self.calculate_error_value(data)

    columnAnalysis = SingleAnalysisVariance(data)
    self.c_ss = columnAnalysis.between_group_ss
    self.c_ms = columnAnalysis.between_group_ss / (self.r - 1)
    self.c_f = self.c_ms / self.e_ms
    self.c_statistics_info = columnAnalysis.statistics_info
    self.c_f_distribute = f(self.r - 1, (self.r - 1) * (self.k - 1))
    self.c_p_value = self.c_f_distribute.sf(self.c_f)

    lineAnalysis = SingleAnalysisVariance(data.T)
    self.l_ss = lineAnalysis.between_group_ss
    self.l_ms = lineAnalysis.between_group_ss / (self.k - 1)
    self.l_f = self.l_ms / self.e_ms
    self.l_statistics_info = lineAnalysis.statistics_info
    self.l_f_distribute = f(self.k - 1, (self.r - 1) * (self.k - 1))
    self.l_p_value = self.l_f_distribute.sf(self.l_f)
def generate_matrix():
    def f(X1, X2, X3):
        from random import randrange  # noise term (unless imported at module level)
        y = 0.3 + 4.1 * X1 + 2.8 * X2 + 7.8 * X3 + 1.4 * X1 * X1 + 0.2 * X2 * X2 + \
            2.4 * X3 * X3 + 9.7 * X1 * X2 + 0.6 * X1 * X3 + 4.4 * X2 * X3 + \
            3.4 * X1 * X2 * X3 + randrange(0, 10) - 5
        return y

    matrix_y = [[f(matrix_x[j][0], matrix_x[j][1], matrix_x[j][2])
                 for i in range(m)] for j in range(N)]
    return matrix_y
def F(d1, d2, tag=None):
    """
    An F (fisher) random variate

    Parameters
    ----------
    d1 : int
        Numerator degrees of freedom
    d2 : int
        Denominator degrees of freedom
    """
    assert isinstance(d1, int) and d1 > 1, 'd1 must be an int greater than 1'
    assert isinstance(d2, int) and d2 > 1, 'd2 must be an int greater than 1'
    return uv(rv=ss.f(d1, d2), tag=tag)
def Fisher(d1, d2, tag=None):
    """
    An F (fisher) random variate

    Parameters
    ----------
    d1 : int
        Numerator degrees of freedom
    d2 : int
        Denominator degrees of freedom
    """
    assert int(d1) == d1 and d1 >= 1, 'Fisher (F) "d1" must be an integer greater than 0'
    assert int(d2) == d2 and d2 >= 1, 'Fisher (F) "d2" must be an integer greater than 0'
    return uv(ss.f(d1, d2), tag=tag)
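# For orientation (plain scipy, independent of the uv wrapper that F and
# Fisher above return): the frozen distribution both constructors build,
# and its upper 5% quantile.
from scipy import stats as ss
print(ss.f(5, 12).ppf(0.95))  # ~3.11 for F(5, 12)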
def __init__(self):
    self.dist_equivalents = [
        # (transf, stats.lognorm(1))
        (lognormalg, stats.lognorm(1)),
        # transf2
        (squarenormalg, stats.chi2(1)),
        (absnormalg, stats.halfnorm),
        (absnormalg, stats.foldnorm(1e-5)),  # try frozen
        # (negsquarenormalg, 1-stats.chi2),  # won't work as distribution
        (squaretg(10), stats.f(1, 10))]  # try both frozen

    l, s = 0.0, 1.0
    self.ppfq = [0.1, 0.5, 0.9]
    self.xx = [0.95, 1.0, 1.1]
    self.nxx = [-0.95, -1.0, -1.1]
def f_threshold_mway_rm(n_subjects, factor_levels, effects='A*B',
                        pvalue=0.05):
    """Compute F-value thresholds for a two-way ANOVA.

    Parameters
    ----------
    n_subjects : int
        The number of subjects to be analyzed.
    factor_levels : list-like
        The number of levels per factor.
    effects : str
        A string denoting the effect to be returned. The following
        mapping is currently supported:

            * ``'A'``: main effect of A
            * ``'B'``: main effect of B
            * ``'A:B'``: interaction effect
            * ``'A+B'``: both main effects
            * ``'A*B'``: all three effects

    pvalue : float
        The p-value to be thresholded.

    Returns
    -------
    F_threshold : list | float
        list of F-values for each effect if the number of effects
        requested > 2, else float.

    See Also
    --------
    f_oneway
    f_mway_rm

    Notes
    -----
    .. versionadded:: 0.10
    """
    from scipy.stats import f
    effect_picks, _ = _map_effects(len(factor_levels), effects)

    F_threshold = []
    for _, df1, df2 in _iter_contrasts(n_subjects, factor_levels,
                                       effect_picks):
        F_threshold.append(f(df1, df2).isf(pvalue))

    return F_threshold if len(F_threshold) > 1 else F_threshold[0]
def setup_class(cls):
    cls.dist_equivalents = [
        # (transf, stats.lognorm(1))
        # The below fails on the SPARC box with scipy 10.1
        # (lognormalg, stats.lognorm(1)),
        # transf2
        (squarenormalg, stats.chi2(1)),
        (absnormalg, stats.halfnorm),
        (absnormalg, stats.foldnorm(1e-5)),  # try frozen
        # (negsquarenormalg, 1-stats.chi2),  # won't work as distribution
        (squaretg(10), stats.f(1, 10))  # try both frozen
    ]

    l, s = 0.0, 1.0
    cls.ppfq = [0.1, 0.5, 0.9]
    cls.xx = [0.95, 1.0, 1.1]
    cls.nxx = [-0.95, -1.0, -1.1]
def f_oneway(self, *args):
    if args[0] is None:
        return [None, None]
    n = len(args) * len(args[0])  # assumes a balanced design
    m = len(args)
    fS = m - 1
    fe = n - m
    SA = self.getSA(*args)
    Se = self.getSe(*args)
    VA = SA / fS
    Ve = Se / fe
    FA = VA / Ve
    F = f(fS, fe)
    p = F.sf(FA)
    return [float('%.6f' % FA), float('%.6f' % p)]
def __init__(self, data): """ :param data: the data to analysis, it'a a DataFrame """ self.n = data.notnull().sum().sum() self.k = data.shape[1] self.statistics_info = self._create_statistics_info(data) self.total_mean = self.statistics_info["sum"].sum() / self.statistics_info["count"].sum() self.between_group_ss = ( self.statistics_info["count"] * (self.statistics_info["mean"] - self.total_mean) ** 2 ).sum() self.ms_between_group = self.between_group_ss / (self.k - 1) self.inside_group_ss = self.statistics_info["sumdiff"].sum() self.ms_inside_group = self.inside_group_ss / (self.n - self.k) self.f = self.ms_between_group / self.ms_inside_group self.f_distribute = f(self.k - 1, self.n - self.k) self.p_value = self.f_distribute.sf(self.f) self.t_distribute = t(self.n - self.k)
def understand_f_fitting(sigma2):
    """
    Test function: Understanding the F scaled fitting procedure.
    """
    import matplotlib.pyplot as plt

    prms = st.f.fit(sigma2, f0=19)  # , floc=0)
    print(prms)
    dfn = prms[0]
    dfd = prms[1]
    scale_ = prms[3]
    loc_ = prms[2]

    x = np.linspace(st.f.ppf(0.01, dfn, dfd, scale=scale_, loc=loc_),
                    st.f.ppf(0.99, dfn, dfd, scale=scale_, loc=loc_), 100)
    rv = st.f(dfn, dfd, scale=scale_)  # , loc=loc_)
    plt.plot(x, rv.pdf(x), color='#ee9041', lw=2)
    h = plt.hist(sigma2, density=True, color='#459db9')
def f_mway_rm(data, factor_levels, effects='all', correction=False,
              return_pvals=True):
    """Compute M-way repeated measures ANOVA for fully balanced designs.

    Parameters
    ----------
    data : ndarray
        3D array where the first two dimensions are compliant with a
        subjects X conditions scheme where the first factor repeats
        slowest::

                        A1B1 A1B2 A2B1 A2B2
            subject 1   1.34 2.53 0.97 1.74
            subject ... .... .... .... ....
            subject k   2.45 7.90 3.09 4.76

        The last dimensions is thought to carry the observations for mass
        univariate analysis.
    factor_levels : list-like
        The number of levels per factor.
    effects : str | list
        A string denoting the effect to be returned. The following
        mapping is currently supported (example with 2 factors):

            * ``'A'``: main effect of A
            * ``'B'``: main effect of B
            * ``'A:B'``: interaction effect
            * ``'A+B'``: both main effects
            * ``'A*B'``: all three effects
            * ``'all'``: all effects (equals 'A*B' in a 2 way design)

        If list, effect names are used: ``['A', 'B', 'A:B']``.
    correction : bool
        The correction method to be employed if one factor has more than two
        levels. If True, sphericity correction using the Greenhouse-Geisser
        method will be applied.
    return_pvals : bool
        If True, return p-values corresponding to F-values.

    Returns
    -------
    F_vals : ndarray
        An array of F-statistics with length corresponding to the number
        of effects estimated. The shape depends on the number of effects
        estimated.
    p_vals : ndarray
        If not requested via return_pvals, defaults to an empty array.

    See Also
    --------
    f_oneway
    f_threshold_mway_rm

    Notes
    -----
    .. versionadded:: 0.10
    """
    from scipy.stats import f
    if data.ndim == 2:  # general purpose support, e.g. behavioural data
        data = data[:, :, np.newaxis]
    elif data.ndim > 3:  # let's allow for some magic here.
        data = data.reshape(
            data.shape[0], data.shape[1], np.prod(data.shape[2:]))

    effect_picks, _ = _map_effects(len(factor_levels), effects)
    n_obs = data.shape[2]
    n_replications = data.shape[0]

    # put last axis in front to 'iterate' over mass univariate instances.
    data = np.rollaxis(data, 2)
    fvalues, pvalues = [], []
    for c_, df1, df2 in _iter_contrasts(n_replications, factor_levels,
                                        effect_picks):
        y = np.dot(data, c_)
        b = np.mean(y, axis=1)[:, np.newaxis, :]
        ss = np.sum(np.sum(y * b, axis=2), axis=1)
        mse = (np.sum(np.sum(y * y, axis=2), axis=1) - ss) / (df2 / df1)
        fvals = ss / mse
        fvalues.append(fvals)
        if correction:
            # sample covariances, leave off "/ (y.shape[1] - 1)" norm because
            # it falls out.
            v = np.array([np.dot(y_.T, y_) for y_ in y])
            v = (np.array([np.trace(vv) for vv in v]) ** 2 /
                 (df1 * np.sum(np.sum(v * v, axis=2), axis=1)))
            eps = v

        df1, df2 = np.zeros(n_obs) + df1, np.zeros(n_obs) + df2
        if correction:
            # numerical imprecision can cause eps=0.99999999999999989
            # even with a single category, so never let our degrees of
            # freedom drop below 1.
            df1, df2 = [np.maximum(d[None, :] * eps, 1.) for d in (df1, df2)]

        if return_pvals:
            pvals = f(df1, df2).sf(fvals)
        else:
            pvals = np.empty(0)
        pvalues.append(pvals)

    # handle single effect returns
    return [np.squeeze(np.asarray(vv)) for vv in (fvalues, pvalues)]
def FandPV(df1, df2, fval):
    rv = _ss.f(df1, df2)
    return 1 - rv.cdf(fval)
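# FandPV above returns the upper-tail probability P(F > fval), equivalently
# _ss.f(df1, df2).sf(fval). Example with invented numbers:
print(FandPV(3, 40, 2.84))  # ~0.05, since 2.84 is near the F(3, 40) 95% point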
def f_twoway_rm(data, factor_levels, effects='A*B', alpha=0.05,
                correction=False, return_pvals=True):
    """ 2 way repeated measures ANOVA for fully balanced designs

    data : ndarray
        3D array where the first two dimensions are compliant with a
        subjects X conditions scheme: first factor repeats slowest:

                        A1B1 A1B2 A2B1 A2B2
            subject 1   1.34 2.53 0.97 1.74
            subject ... .... .... .... ....
            subject k   2.45 7.90 3.09 4.76

        The last dimensions is thought to carry the observations for mass
        univariate analysis.
    factor_levels : list-like
        The number of levels per factor.
    effects : str
        A string denoting the effect to be returned. The following
        mapping is currently supported:
            'A': main effect of A
            'B': main effect of B
            'A:B': interaction effect
            'A+B': both main effects
            'A*B': all three effects
    alpha : float
        The significance threshold.
    correction : bool
        The correction method to be employed if one factor has more than two
        levels. If True, sphericity correction using the Greenhouse-Geisser
        method will be applied.
    return_pvals : bool
        If True, return p values corresponding to f values.

    Returns
    -------
    f_vals : ndarray
        An array of f values with length corresponding to the number of
        effects estimated. The shape depends on the number of effects
        estimated.
    p_vals : ndarray
        If not requested via return_pvals, defaults to an empty array.
    """
    if data.ndim == 2:  # general purpose support, e.g. behavioural data
        data = data[:, :, np.newaxis]
    elif data.ndim > 3:  # let's allow for some magic here.
        data = data.reshape(data.shape[0], data.shape[1],
                            np.prod(data.shape[2:]))

    effect_picks = _check_effects(effects)
    n_obs = data.shape[2]
    n_replications = data.shape[0]

    # put last axis in front to 'iterate' over mass univariate instances.
    data = np.rollaxis(data, 2)
    fvalues, pvalues = [], []
    for c_, df1, df2 in _iter_contrasts(n_replications, factor_levels,
                                        effect_picks):
        y = np.dot(data, c_)
        b = np.mean(y, axis=1)[:, np.newaxis, :]
        ss = np.sum(np.sum(y * b, axis=2), axis=1)
        mse = (np.sum(np.sum(y * y, axis=2), axis=1) - ss) / (df2 / df1)
        fvals = ss / mse
        fvalues.append(fvals)
        if correction:
            # sample covariances, leave off "/ (y.shape[1] - 1)" norm because
            # it falls out.
            v = np.array(list(map(np.dot, y.swapaxes(2, 1), y)))
            v = (np.array([np.trace(vv) for vv in v]) ** 2 /
                 (df1 * np.sum(np.sum(v * v, axis=2), axis=1)))
            eps = v

        df1, df2 = np.zeros(n_obs) + df1, np.zeros(n_obs) + df2
        if correction:
            df1, df2 = [d[None, :] * eps for d in (df1, df2)]

        if return_pvals:
            pvals = stats.f(df1, df2).sf(fvals)
        else:
            pvals = np.empty(0)
        pvalues.append(pvals)

    # handle single effect returns
    return [np.squeeze(np.asarray(v)) for v in (fvalues, pvalues)]
plt.figure(1)
plt.plot(support[ix], rv.pdf(support[ix]), label='Actual')
plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott')
plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS')
plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML')
plt.title("Nonparametric Estimation of the Density of Beta Distributed "
          "Random Variable")
plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML'))

# f distribution
df = 100
dn = 100
nobs = 250
support = np.random.f(dn, df, size=nobs)
rv = stats.f(dn, df)  # match the (dfnum, dfden) order used to draw the sample
ix = np.argsort(support)

dens_normal = KDEMultivariate(data=[support], var_type='c',
                              bw='normal_reference')
dens_cvls = KDEMultivariate(data=[support], var_type='c', bw='cv_ls')
dens_cvml = KDEMultivariate(data=[support], var_type='c', bw='cv_ml')

plt.figure(2)
plt.plot(support[ix], rv.pdf(support[ix]), label='Actual')
plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott')
plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS')
plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML')
plt.title("Nonparametric Estimation of the Density of f Distributed "
          "Random Variable")
plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML'))
def test_causality(self, equation, variables, kind='f', signif=0.05,
                   verbose=True):
    """Compute test statistic for null hypothesis of Granger-noncausality,
    general function to test joint Granger-causality of multiple variables

    Parameters
    ----------
    equation : string or int
        Equation to test for causality
    variables : sequence (of strings or ints)
        List, tuple, etc. of variables to test for Granger-causality
    kind : {'f', 'wald'}
        Perform F-test or Wald (chi-sq) test
    signif : float, default 5%
        Significance level for computing critical values for test,
        defaulting to standard 0.95 level

    Notes
    -----
    Null hypothesis is that there is no Granger-causality for the indicated
    variables. The degrees of freedom in the F-test are based on the number
    of variables in the VAR system, that is, degrees of freedom are equal to
    the number of equations in the VAR times degree of freedom of a single
    equation.

    Returns
    -------
    results : dict
    """
    if isinstance(variables, (str, int, np.integer)):
        variables = [variables]

    k, p = self.neqs, self.k_ar

    # number of restrictions
    N = len(variables) * self.k_ar

    # Make restriction matrix
    C = np.zeros((N, k ** 2 * p + k), dtype=float)

    eq_index = self.get_eq_index(equation)
    vinds = mat([self.get_eq_index(v) for v in variables])

    # remember, vec is column order!
    offsets = np.concatenate([k + k ** 2 * j + k * vinds + eq_index
                              for j in range(p)])
    C[np.arange(N), offsets] = 1

    # Lutkepohl 3.6.5
    Cb = np.dot(C, vec(self.params.T))
    middle = L.inv(chain_dot(C, self.cov_params, C.T))

    # wald statistic
    lam_wald = statistic = chain_dot(Cb, middle, Cb)

    if kind.lower() == 'wald':
        df = N
        dist = stats.chi2(df)
    elif kind.lower() == 'f':
        statistic = lam_wald / N
        df = (N, k * self.df_resid)
        dist = stats.f(*df)
    else:
        raise Exception('kind %s not recognized' % kind)

    pvalue = dist.sf(statistic)
    crit_value = dist.ppf(1 - signif)
    conclusion = 'fail to reject' if statistic < crit_value else 'reject'

    results = {
        'statistic': statistic,
        'crit_value': crit_value,
        'pvalue': pvalue,
        'df': df,
        'conclusion': conclusion,
        'signif': signif
    }

    if verbose:
        summ = output.causality_summary(results, variables, equation, kind)
        print(summ)

    return results
def show_continuous():
    """Show a variety of continuous distributions"""
    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)', '')

    # Exponential distribution
    showDistribution(stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)', '')

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g')
    hold(True)
    showDistribution(stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(stats.f(3, 4), stats.f(10, 15),
                     'F-Distribution', 'F', 'P(F)', ['(3,4) DOF', '(10,15) DOF'])

    # Uniform distribution
    showDistribution(stats.uniform, '',
                     'Uniform Distribution', 'X', 'P(X)', '')

    # Logistic distribution
    showDistribution(stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)', ['Normal', 'Logistic'])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)', '', xmin=-0.1)