def t_test(data1, data2=None, tail='both', mu=0, equal=True):
    """Run a one-sample or two-sample Student's t-test.

    Args:
        data1: First sample.
        data2: Optional second sample; when omitted a one-sample test of
            ``data1`` against ``mu`` is performed.
        tail: ``'both'``, ``'left'`` or ``'right'`` alternative.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        equal: For two samples, pool the variances when true; otherwise use
            the unpooled standard error with Welch-Satterthwaite degrees of
            freedom.

    Returns:
        Tuple ``(t statistic, degrees of freedom, p-value)``.
    """
    assert tail in ['both', 'left', 'right'], 'tail should be one of "both","left","right"'

    if data2 is not None:
        size1 = len(data1)
        size2 = len(data2)
        diff = mean(data1) - mean(data2)
        var1 = variance(data1)
        var2 = variance(data2)
        if not equal:
            # Unequal variances: each sample contributes var / n.
            part1 = var1 / size1
            part2 = var2 / size2
            statistic = (diff - mu) / sqrt(part1 + part2)
            dof = (part1 + part2)**2 / (part1**2 / (size1 - 1) + part2**2 / (size2 - 1))
        else:
            # Equal variances: pooled standard deviation.
            pooled = sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / (size1 + size2 - 2))
            statistic = (diff - mu) / (pooled * sqrt(1 / size1 + 1 / size2))
            dof = size1 + size2 - 2
    else:
        centre = mean(data1)
        std_err = std(data1) / sqrt(len(data1))
        statistic = (centre - mu) / std_err
        dof = len(data1) - 1

    if tail == "left":
        p_value = t.cdf(statistic, dof)
    elif tail == "both":
        p_value = 2 * (1 - t.cdf(abs(statistic), dof))
    else:
        p_value = 1 - t.cdf(statistic, dof)
    return statistic, dof, p_value
def mean_diff_ci_z_est(data1, data2, alpha, sigma1, sigma2):
    """Confidence interval for a difference of means with known sigmas.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma1: Known population standard deviation of the first population.
        sigma2: Known population standard deviation of the second population.

    Returns:
        Tuple ``(lower, upper)`` of the two-sided interval for
        ``mean(data1) - mean(data2)``.
    """
    n1 = len(data1)
    n2 = len(data2)
    mean_diff = mean(data1) - mean(data2)
    # norm.ppf(alpha / 2) is the (negative) lower quantile, hence the abs().
    z_value = abs(norm.ppf(alpha / 2))
    # The standard error needs the variances (sigma squared), matching the
    # two-sample z_test; the previous code summed the raw sigmas.
    margin = sqrt(sigma1**2 / n1 + sigma2**2 / n2) * z_value
    return mean_diff - margin, mean_diff + margin
def anova_twoway(data):
    """Two-way analysis of variance for a 2x2 design.

    ``data`` is converted to a 2-D array whose rows are the cells of the
    design. The code indexes rows 0..3 and reshapes the row means to
    (2, 2), so four rows are required: rows 0-1 are averaged as the first
    level of factor i and rows 2-3 as the second; rows 0 and 2 form the first
    level of factor j and rows 1 and 3 the second.

    Returns a float32 array with the rows [factor i ("method"),
    factor j ("reward"), interaction, residuals] and the columns
    [degrees of freedom, sum of squares, mean square, F, p-value].
    NOTE(review): the residual row carries ``None`` for F and p, which
    presumably become NaN in the float32 cast - confirm on the NumPy version
    in use.
    """
    r, s = 2, 2
    data = np.array(data)
    # Column vector holding the (common) number of observations per row.
    group_szs = np.tile(np.size(data, axis=1), (np.size(data, axis=0), 1))
    n = sum(group_szs)  # total number of observations (a length-1 array)
    # Means: per cell, overall, and per level of each factor.
    group_means = np.mean(data, axis=1)
    group_mean = group_means.dot(group_szs) / n
    # NOTE(review): ``mean`` is a helper defined outside this block.
    group_i_means = np.array([mean(group_means[:2]), mean(group_means[2:])])
    group_j_means = np.array([(group_means[0] + group_means[2]) / 2, (group_means[1] + group_means[3]) / 2])
    # Main effects of each level of i and j.
    group_i_effect = group_i_means - group_mean
    group_j_effect = group_j_means - group_mean
    # Interaction effect: cell mean minus grand mean minus both main effects.
    group_ij_effect = (group_means.reshape(2, 2) - np.tile(
        group_mean, (2, 2))) - np.tile(group_i_effect, (2, 1)).T - np.tile(group_j_effect, (2, 1))
    # Total variation (computed here but not used in the returned table).
    sst = np.sum((data - group_mean)**2)
    # Variation attributed to the first factor.
    ss_method = ((group_i_means - group_mean)**2).dot(
        [np.sum(group_szs[:2]), np.sum(group_szs[2:])])
    # Variation attributed to the second factor.
    ss_reward = ((group_j_means - group_mean)**2).dot([
        np.sum([group_szs[0], group_szs[2]]),
        np.sum([group_szs[1], group_szs[3]])
    ])
    # Variation attributed to the interaction of the two factors.
    ss_mr = (group_ij_effect.reshape(1, 4)**2).dot(group_szs)
    # Variation left over within the cells.
    ss_error = np.sum((data - group_means.reshape(-1, 1))**2)
    # Mean square error.
    ms_error = ss_error / (n - r * s)
    # Mean square, F value and p-value for the first factor.
    ms_method = ss_method / (r - 1)
    f_ms_method = ms_method / ms_error
    p_ms_method = 1 - f.cdf(f_ms_method, r - 1, n - r * s)
    # Mean square, F value and p-value for the second factor.
    # NOTE(review): uses r - 1 where s - 1 would be expected; the two are
    # equal only because r == s == 2.
    ms_reward = ss_reward / (r - 1)
    f_ms_reward = ms_reward / ms_error
    p_ms_reward = 1 - f.cdf(f_ms_reward, r - 1, n - r * s)
    # Mean square, F value and p-value for the interaction.
    # NOTE(review): uses r - 1 where (r - 1) * (s - 1) would be expected;
    # again identical for the 2x2 case.
    ms_mr = ss_mr / (r - 1)
    f_ms_mr = ms_mr / ms_error
    p_ms_mr = 1 - f.cdf(f_ms_mr, r - 1, n - r * s)
    # Assemble the rows of the output table.
    method = [r - 1, ss_method, ms_method, f_ms_method, p_ms_method]
    reward = [r - 1, ss_reward, ms_reward, f_ms_reward, p_ms_reward]
    mr = [r - 1, ss_mr, ms_mr, f_ms_mr, p_ms_mr]
    residuals = [n - r * s, ss_error, ms_error, None, None]
    return np.array([method, reward, mr, residuals]).astype(np.float32)
def z_test(data1, data2=None, tail="both", mu=0.0, sigma1=1.0, sigma2=None):
    """Run a one-sample or two-sample z-test with known sigmas.

    Args:
        data1: First sample.
        data2: Optional second sample; requires ``sigma2`` when given.
        tail: ``'both'``, ``'left'`` or ``'right'`` alternative.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        sigma1: Known standard deviation of the first population.
        sigma2: Known standard deviation of the second population.

    Returns:
        Tuple ``(z statistic rounded to 2 decimals, p-value)``; the p-value
        is computed from the unrounded statistic.
    """
    assert tail in ['both', 'left', 'right'], 'tail should be one of "both", "left", "right"'

    if data2 is not None:
        # Two populations: test the difference of the sample means.
        assert sigma2 is not None
        observed = mean(data1) - mean(data2)
        std_err = np.sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))
    else:
        # Single population.
        observed = mean(data1)
        std_err = sigma1 / np.sqrt(len(data1))
    z_val = (observed - mu) / std_err

    if tail == 'left':
        p_value = norm.cdf(z_val)
    elif tail == 'both':
        p_value = 2 * (1 - norm.cdf(abs(z_val)))
    else:
        p_value = 1 - norm.cdf(z_val)
    return round(z_val, 2), p_value
def mean_diff_ci_z_est(data1, data2, alpha, sigma1, sigma2):
    """Confidence interval for a mean difference, both sigmas known.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma1: Known standard deviation of the first population.
        sigma2: Known standard deviation of the second population.

    Returns:
        Tuple ``(lower, upper)``, each rounded to 2 decimals.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    # ppf returns the lower quantile, so take its magnitude.
    critical = abs(norm.ppf(alpha / 2))
    margin = np.sqrt(sigma1**2 / size1 + sigma2**2 / size2) * critical
    lower = round(centre - margin, 2)
    upper = round(centre + margin, 2)
    return lower, upper
def mean_one_sided_lower_ci_est(data, alpha, sigma=None):
    """One-sided confidence interval for the mean, bounded from below.

    Args:
        data: Sample observations.
        alpha: Significance level of the one-sided interval.
        sigma: Known population standard deviation; when ``None`` the sample
            standard deviation and the t distribution are used instead.

    Returns:
        Tuple ``(lower bound, inf)``.
    """
    n = len(data)
    if sigma is not None:
        # Population sigma known: standard normal quantile.
        critical = abs(norm.ppf(alpha))
        spread = sigma
    else:
        # Population sigma unknown: t quantile with n - 1 degrees of freedom.
        critical = abs(t.ppf(alpha, n - 1))
        spread = std(data)
    return mean(data) - spread / sqrt(n) * critical, inf
def variance_bias(data):
    """Return the biased variance of ``data`` (sum of squares divided by n).

    Returns ``None`` when ``data`` is ``None`` or has fewer than two items.
    """
    if data is not None and len(data) > 1:
        centre = mean(data)
        squared_devs = [(value - centre)**2 for value in data]
        return sum(squared_devs) / len(data)
    return None
def anova_oneway(data):
    """One-way analysis of variance over a sequence of groups.

    Args:
        data: Sequence of groups, each a sequence of observations. Groups
            may differ in size.

    Returns:
        Tuple ``(F value, between-group df, within-group df, p-value)``.
    """
    k = len(data)
    assert k > 1
    summary = [(mean(group), len(group)) for group in data]
    n = sum(size for _, size in summary)
    assert n > k
    grand_mean = sum(centre * size for centre, size in summary) / n

    # Total and between-group sums of squares; the within-group part is the
    # remainder.
    sst = sum(sum((y - grand_mean)**2 for y in group) for group in data)
    ssg = sum((centre - grand_mean)**2 * size for centre, size in summary)
    sse = sst - ssg

    dfg, dfe = k - 1, n - k
    f_value = (ssg / dfg) / (sse / dfe)
    return f_value, dfg, dfe, 1 - f.cdf(f_value, dfg, dfe)
def anova_oneway(data):
    """One-way analysis of variance.

    Sums of squares are accumulated group by group, so the groups may have
    different sizes (stacking ``data`` into one rectangular array, as was
    done before, only works for equally sized groups).

    Args:
        data: Sequence of groups, each a sequence of numeric observations.

    Returns:
        Tuple ``(F value rounded to 2 decimals, between-group df,
        within-group df, p-value)``; the p-value is computed from the
        unrounded F value.

    Raises:
        AssertionError: If there are fewer than two groups or the total
            number of observations does not exceed the number of groups.
    """
    k = len(data)  # number of groups
    assert k > 1, '数据量得大于1'
    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)  # total number of observations
    assert n > k
    grand_mean = sum(
        group_mean * group_sz
        for group_mean, group_sz in zip(group_means, group_szs)) / n

    groups = [np.asarray(group, dtype=float) for group in data]
    # Total, between-group and within-group sums of squares.
    sst = sum(float(np.sum((group - grand_mean)**2)) for group in groups)
    ssg = sum(
        group_sz * (group_mean - grand_mean)**2
        for group_mean, group_sz in zip(group_means, group_szs))
    sse = sum(
        float(np.sum((group - group_mean)**2))
        for group, group_mean in zip(groups, group_means))
    # Sanity check of the decomposition SST = SSG + SSE, using a tolerance
    # rather than comparing rounded values (which can disagree at a
    # rounding boundary).
    assert np.isclose(sse, sst - ssg)

    dfg = k - 1
    dfe = n - k
    msg = ssg / dfg
    mse = sse / dfe
    f_value = msg / mse
    p = 1 - f.cdf(f_value, dfg, dfe)
    return round(f_value, 2), dfg, dfe, p
def variance_bias(data):
    """Return the biased variance of ``data`` (biased because it divides by n).

    Returns ``None`` when ``data`` has fewer than two items.
    """
    count = len(data)
    if count > 1:
        centre = mean(data)
        squared_devs = [(value - centre)**2 for value in data]
        return sum(squared_devs) / count
    return None
def sample(num_of_samples, sample_sz):
    """Return the means of ``num_of_samples`` uniform(0, 1) samples.

    Each sample consists of ``sample_sz`` draws from ``random.uniform(0.0,
    1.0)``; the returned list holds one sample mean per sample.
    """
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]
def t_test(data1, data2=None, tail='both', mu=0.0, equal=True):
    """Run a one-sample or two-sample Student's t-test.

    Args:
        data1: First sample.
        data2: Optional second sample; when omitted a one-sample test of
            ``data1`` against ``mu`` is performed.
        tail: ``'both'``, ``'left'`` or ``'right'`` alternative.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        equal: For two samples, pool the variances when true; otherwise use
            the unpooled standard error with Welch-Satterthwaite degrees of
            freedom.

    Returns:
        Tuple ``(t statistic rounded to 2 decimals, degrees of freedom
        rounded to 2 decimals, p-value)``; the p-value is computed from the
        unrounded statistic.

    Raises:
        AssertionError: If ``tail`` is not one of the accepted values.
    """
    assert tail in ['both', 'left', 'right'], 'tail should be one of "both", "left", "right"'
    if data2 is None:
        # Single population.
        mean_val = mean(data1)
        se = std(data1) / np.sqrt(len(data1))
        t_val = (mean_val - mu) / se
        df = len(data1) - 1
    else:
        # Two populations.
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)
        if equal:
            # Equal variances: pooled standard deviation.
            sw = np.sqrt(((n1 - 1) * sample1_var + (n2 - 1) * sample2_var) / (n1 + n2 - 2))
            # The standard error scales with 1/n1 + 1/n2; the previous code
            # used 1/n2 twice, which is wrong whenever n1 != n2.
            t_val = (mean_diff - mu) / (sw * np.sqrt(1 / n1 + 1 / n2))
            df = n1 + n2 - 2
        else:
            # Unequal variances: Welch-Satterthwaite degrees of freedom.
            se = np.sqrt(sample1_var / n1 + sample2_var / n2)
            t_val = (mean_diff - mu) / se
            df = (sample1_var / n1 + sample2_var / n2)**2 / (
                (sample1_var / n1)**2 / (n1 - 1) + (sample2_var / n2)**2 / (n2 - 1))
    if tail == 'both':
        # Two-sided test.
        p = 2 * (1 - t.cdf(abs(t_val), df))
    elif tail == 'left':
        # Left-tailed test.
        p = t.cdf(t_val, df)
    else:
        # Right-tailed test.
        p = 1 - t.cdf(t_val, df)
    return round(t_val, 2), round(df, 2), p
def simple_linear_reg(x, y): assert len(x) == len(y) n = len(x) assert n > 1 mean_x = mean(x) mean_y = mean(y) beta1 = covariance(x, y)/variance(x) beta0 = mean_y-beta1*mean_x y_hat = [beta0+beta1*e for e in x] ss_residual = sum((e1-e2)**2 for e1, e2 in zip(y, y_hat)) se_model = sqrt(ss_residual/(n-2)) t_value = beta1/(se_model/sqrt((n-1)*variance(x))) p = 2*(1-t.cdf(abs(t_value), n-2)) return beta0, beta1, t_value, n-2, p
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for a mean difference, population variances unknown.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        equal: Pool the variances when true; otherwise use the unpooled
            standard error with Welch-Satterthwaite degrees of freedom.

    Returns:
        Tuple ``(lower, upper)``, each rounded to 2 decimals.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    var1 = variance(data1)
    var2 = variance(data2)

    if not equal:
        # Variances unknown and unequal.
        part1 = var1 / size1
        part2 = var2 / size2
        dof = (part1 + part2)**2 / (part1**2 / (size1 - 1) + part2**2 / (size2 - 1))
        critical = abs(t.ppf(alpha / 2, dof))
        margin = np.sqrt(part1 + part2) * critical
    else:
        # Variances unknown but assumed equal.
        pooled = np.sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / (size1 + size2 - 2))
        critical = abs(t.ppf(alpha / 2, size1 + size2 - 2))
        margin = pooled * np.sqrt(1 / size1 + 1 / size2) * critical
    return round(centre - margin, 2), round(centre + margin, 2)
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for a mean difference, population variances unknown.

    Args:
        data1: First sample.
        data2: Second sample.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        equal: Pool the variances when true; otherwise use the unpooled
            standard error with Welch-Satterthwaite degrees of freedom.

    Returns:
        Tuple ``(lower, upper)``.
    """
    size1, size2 = len(data1), len(data2)
    centre = mean(data1) - mean(data2)
    var1 = variance(data1)
    var2 = variance(data2)

    if not equal:
        part1 = var1 / size1
        part2 = var2 / size2
        dof = (part1 + part2)**2 / (part1**2 / (size1 - 1) + part2**2 / (size2 - 1))
        critical = abs(t.ppf(alpha / 2, dof))
        margin = sqrt(part1 + part2) * critical
    else:
        pooled = sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / (size1 + size2 - 2))
        critical = abs(t.ppf(alpha / 2, size1 + size2 - 2))
        margin = pooled * sqrt(1 / size1 + 1 / size2) * critical
    return centre - margin, centre + margin
def z_test(data1, data2=None, tail="both", mu=0, sigma1=1, sigma2=None):
    """Run a one-sample or two-sample z-test with known sigmas.

    Args:
        data1: First sample.
        data2: Optional second sample; requires ``sigma2`` when given.
        tail: ``"both"``, ``"left"`` or ``"right"`` alternative.
        mu: Hypothesised mean (one sample) or mean difference (two samples).
        sigma1: Known standard deviation of the first population.
        sigma2: Known standard deviation of the second population.

    Returns:
        Tuple ``(z statistic, p-value)``.
    """
    assert tail in ["both", "left", "right"], 'tail should be one of "both","left","right"'

    if data2 is not None:
        assert sigma2 is not None
        observed = mean(data1) - mean(data2)
        std_err = sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))
    else:
        observed = mean(data1)
        std_err = sigma1 / sqrt(len(data1))
    z_val = (observed - mu) / std_err

    if tail == "left":
        p_value = norm.cdf(z_val)
    elif tail == "both":
        p_value = 2 * (1 - norm.cdf(abs(z_val)))
    else:
        p_value = 1 - norm.cdf(z_val)
    return z_val, p_value
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the population mean.

    Args:
        data: Sample observations.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma: Known population standard deviation; when ``None`` the sample
            standard deviation and the t distribution are used instead.

    Returns:
        Tuple ``(lower, upper)``.
    """
    n = len(data)
    centre = mean(data)
    if sigma is not None:
        std_err = sigma / sqrt(n)
        # ppf yields the lower quantile, so take its magnitude.
        critical = abs(norm.ppf(alpha / 2))
    else:
        std_err = std(data) / sqrt(n)
        critical = abs(t.ppf(alpha / 2, n - 1))
    return centre - std_err * critical, centre + std_err * critical
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the population mean.

    Args:
        data: Sample observations.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma: Known population standard deviation, or ``None`` when it is
            unknown and must be estimated from the sample.

    Returns:
        Tuple ``(lower, upper)``.
    """
    n = len(data)
    centre = mean(data)
    if sigma is not None:
        # Variance known: standard normal quantile.
        std_err = sigma / sqrt(n)
        critical = abs(norm.ppf(alpha / 2))
    else:
        # Variance unknown: t quantile with n - 1 degrees of freedom.
        std_err = std(data) / sqrt(n)
        critical = abs(t.ppf(alpha / 2, n - 1))
    lower = centre - std_err * critical
    upper = centre + std_err * critical
    return lower, upper
def mean_ci_est(data, alpha, sigma=None):
    """Interval estimate of the mean, rounded to 2 decimals.

    Args:
        data: Sample observations.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma: Known population standard deviation, or ``None`` when it is
            unknown and must be estimated from the sample.

    Returns:
        Tuple ``(lower, upper)``, each rounded to 2 decimals.
    """
    n = len(data)
    centre = mean(data)
    if sigma is not None:
        # Variance known.
        std_err = sigma / np.sqrt(n)
        critical = abs(norm.ppf(alpha / 2))
    else:
        # Variance unknown.
        std_err = std(data) / np.sqrt(n)
        critical = abs(t.ppf(alpha / 2, n - 1))
    lower = round(centre - std_err * critical, 2)
    upper = round(centre + std_err * critical, 2)
    return lower, upper
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the population mean.

    Covers both cases: population variance unknown (t distribution with the
    sample standard deviation) and population variance known (standard
    normal distribution with ``sigma``).

    Args:
        data: Sample observations.
        alpha: Significance level; the interval has confidence ``1 - alpha``.
        sigma: Known population standard deviation, or ``None`` when unknown.

    Returns:
        Tuple ``(lower, upper)``.
    """
    n = len(data)  # sample size
    sample_mean = mean(data)  # sample mean
    if sigma is None:
        # Variance unknown.
        s = std(data)  # sample standard deviation
        se = s / sqrt(n)  # standard error
        t_value = abs(t.ppf(alpha / 2, n - 1))  # t critical value
        return sample_mean - se * t_value, sample_mean + se * t_value
    else:
        # Variance known. The previous code divided by the ``sqrt`` function
        # itself instead of ``sqrt(n)``, which raised a TypeError.
        se = sigma / sqrt(n)  # standard error
        # ppf returns the quantile for the left-tail area alpha / 2, which is
        # negative, hence the abs().
        z_value = abs(norm.ppf(alpha / 2))
        return sample_mean - se * z_value, sample_mean + se * z_value
def variance_bias(data):
    """Return the biased variance of ``data`` (biased because it divides by n).

    Returns ``None`` when ``data`` has fewer than two items.
    """
    count = len(data)
    if count > 1:
        centre = mean(data)
        return sum((value - centre)**2 for value in data) / count
    return None


def sample(num_of_samples, sample_sz, var):
    """Return ``var`` applied to each of ``num_of_samples`` uniform samples.

    Every sample holds ``sample_sz`` draws from ``random.uniform(0.0, 1.0)``;
    ``var`` is the estimator (a callable taking the list of draws).
    """
    return [
        var([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


if __name__ == '__main__':
    # Compare the biased and the unbiased estimator: for each, plot the
    # histogram of 1000 estimates, mark their average (black) and the true
    # variance of uniform(0, 1), (b - a)^2 / 12 = 1/12 (red).
    for label, estimator in (("bias: ", variance_bias), ("unbias: ", variance)):
        estimates = sample(1000, 40, estimator)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=mean(estimates), c='black')
        plt.axvline(x=1 / 12, c='red')
        print(label, mean(estimates), 1 / 12)
        plt.show()
总体均值未知,求方差的置信空间 data为传入的样本; alpha为需要传入的置信水平的值 """ n = len(data) #求样本容量 s2 = variance(data) #求样本方差 chi2_lower_value = chi2.ppf( alpha / 2, n - 1) #求坐标左侧Z面积,没错你没看错,因为数学证明的过程中是以右侧为基准的,但是scipy是以左侧为基准的 chi2_upper_value = chi2.ppf(1 - alpha / 2, n - 1) #求坐标右侧Z面积 return (n - 1) * s2 / chi2_upper_value, (n - 1) * s2 / chi2_lower_value if __name__ == '__main__': salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] #18岁月收入数据 salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182] #35岁月收入数据 print(mean(salary_18)) #平均月收入的点估计 print(mean_ci_est(salary_18, 0.05)) #平均月收入的区间估计 print(mean(salary_35)) #平均月收入的点估计 print(mean_ci_est(salary_35, 0.05)) #平均月收入的区间估计 print() print(std(salary_18)) #整体方差的点估计开根 print(variance(salary_18)) #整体方差的点估计(样本方差) print(var_ci_est(salary_18, 0.05)) #区间估计 print(std(salary_35)) #整体方差的点估计开根 print(variance(salary_35)) #整体方差的点估计(样本方差) print(var_ci_est(salary_35, 0.05)) #区间估计 print()
def variance_bias(data):
    """Return the biased variance of ``data`` (sum of squares divided by n).

    Returns ``None`` when ``data`` is ``None`` or has fewer than two items.
    """
    if data is not None and len(data) > 1:
        centre = mean(data)
        return sum((value - centre)**2 for value in data) / len(data)
    return None


def sample(num_of_samples, sample_sz, var):
    """Draw uniform(0, 1) samples and return one ``var`` estimate per sample.

    ``num_of_samples`` samples of ``sample_sz`` draws each are generated;
    ``var`` is the estimator applied to every sample.
    """
    return [
        var([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


if __name__ == '__main__':
    # Draw both sets of estimates first, then plot them side by side:
    # biased estimator on the left, unbiased on the right. The yellow line
    # is the average estimate, the red line the true variance 1/12.
    estimate_sets = [sample(1000, 40, variance_bias), sample(1000, 40, variance)]
    for position, estimates in zip((121, 122), estimate_sets):
        plt.subplot(position)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=mean(estimates), c='y')
        plt.axvline(x=1 / 12, c='r')
    plt.show()
from playStats.descriptive_stats import std

if __name__ == '__main__':
    data = [2, 2, 2, 1, 1, 1, 3, 3]
    # Smoke-test each descriptive statistic in turn and print its result:
    # frequency, mode, median, mean, range, quartiles, variance, standard
    # deviation.
    # NOTE(review): only ``std`` is imported in the visible part of the file;
    # the other helpers are presumably imported elsewhere - confirm.
    for statistic in (frequency, mode, median, mean, rng, quartile, variance, std):
        print(statistic(data))
def sample(num_of_samples, sample_sz):
    """Return the means of ``num_of_samples`` uniform(0, 1) samples.

    Each sample is built from ``sample_sz`` draws of ``random.uniform(0.0,
    1.0)`` and contributes its mean to the returned list.
    """
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]
import random
import matplotlib.pyplot as plt
from playStats.descriptive_stats import mean


def sample(num_of_samples, sample_sz):
    """Return the means of ``num_of_samples`` uniform(0, 1) samples.

    Each sample is built from ``sample_sz`` draws of ``random.uniform(0.0,
    1.0)`` and contributes its mean to the returned list.
    """
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


if __name__ == "__main__":
    data = sample(1000, 40)
    plt.hist(data, bins='auto', rwidth=0.8)
    # Vertical line at the mean of the sample means.
    plt.axvline(x=mean(data), c='red')
    plt.show()
def variance_bias(data):
    """Return the biased variance of ``data`` (sum of squares divided by n).

    Returns ``None`` when ``data`` has fewer than two items.
    """
    count = len(data)
    if count > 1:
        centre = mean(data)
        return sum((value - centre)**2 for value in data) / (count)
    return None


def sample(num_of_samples, sample_sz, var):
    """Draw uniform(0, 1) samples and return one ``var`` estimate per sample.

    ``num_of_samples`` samples of ``sample_sz`` draws each are generated;
    ``var`` is the estimator applied to every sample.
    """
    return [
        var([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


if __name__ == "__main__":
    # For the biased and then the unbiased estimator: histogram of 1000
    # estimates, their average in black, the true variance 1/12 in red.
    for label, estimator in (("bias :", variance_bias), ("unbias :", variance)):
        estimates = sample(1000, 40, estimator)
        plt.hist(estimates, bins="auto", rwidth=0.8)
        plt.axvline(x=mean(estimates), c='black')
        plt.axvline(x=1 / 12, c='red')
        print(label, mean(estimates), 1 / 12)
        plt.show()
import random
from playStats.descriptive_stats import mean, variance
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # Plot the sample mean and sample variance of standard-normal samples
    # against the sample size, for sizes 20, 70, ..., 9970.
    indices = list(range(20, 10001, 50))
    sample_means = []
    sample_vars = []
    for sz in indices:
        sample = [random.gauss(0.0, 1.0) for _ in range(sz)]
        sample_means.append(mean(sample))
        sample_vars.append(variance(sample))
    plt.plot(indices, sample_means)
    plt.plot(indices, sample_vars)
    plt.show()
import random
from playStats.descriptive_stats import mean
from playStats.descriptive_stats import variance
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Plot the sample mean and sample variance of standard-normal samples
    # against the sample size, with reference lines at the values the
    # samples are generated with (mean 0 in red, variance 1 in blue).
    indices = list(range(20, 10001, 50))
    data_mean = []
    data_variance = []
    for sample_sz in indices:
        sample = [random.gauss(0.0, 1.0) for _ in range(sample_sz)]
        data_mean.append(mean(sample))
        data_variance.append(variance(sample))
    plt.plot(indices, data_mean)
    plt.axhline(0, c='r')
    plt.plot(indices, data_variance)
    plt.axhline(1, c='b')
    plt.show()
if n <= 1: return None mean_value = mean(data) return sum((e - mean_value)**2 for e in data) / n def sample(num_of_samples, sample_sz, var): data = [] for _ in range(num_of_samples): data.append(var([random.uniform(0.0, 1.0) for _ in range(sample_sz) ])) #在0-1的均匀分布中抽取sample_sz数量的个体组成一个样本,取该样本均值 return data if __name__ == "__main__": #biased data1 = sample(1000, 40, variance_bias) plt.hist(data1, bins="auto", rwidth=0.8) plt.axvline(x=mean(data1), c="black") plt.axvline(x=1 / 12, c="red") #计算0-1均匀分布的总体的方差:(1-0)**2/12 print("bias: ", mean(data1), 1 / 12) #打印实验值和理论值 plt.show() #unbiased data2 = sample(1000, 40, variance) plt.hist(data2, bins="auto", rwidth=0.8) plt.axvline(x=mean(data2), c="black") plt.axvline(x=1 / 12, c="red") # 计算0-1均匀分布的总体的方差:(1-0)**2/12 print("unbias: ", mean(data2), 1 / 12) # 打印实验值和理论值 plt.show()