def t_test(data1, data2=None, tail='both', mu=0, equal=True):
    """Compute a t statistic, its degrees of freedom and the matching p-value.

    With only ``data1`` given, the sample mean is tested against ``mu``.
    With ``data2`` given as well, the difference
    ``mean(data1) - mean(data2)`` is tested against ``mu``:

    * ``equal=True``  -- the two sample variances are pooled and
      ``df = n1 + n2 - 2``;
    * ``equal=False`` -- the unpooled standard error is used together with
      the Welch-Satterthwaite approximation for the degrees of freedom.

    ``tail`` selects the p-value: ``'both'`` doubles the area beyond ``|t|``,
    ``'left'`` is the area below ``t`` and ``'right'`` the area above it.

    Returns:
        A ``(t_val, df, p)`` tuple.

    Raises:
        AssertionError: if ``tail`` is not 'both', 'left' or 'right'.

    Note: ``mean``, ``std``, ``variance``, ``sqrt`` and the distribution
    object ``t`` are taken from the enclosing module (``std``/``variance``
    are presumably the sample, n-1, estimators -- confirm against the file).
    """
    assert tail in ['both', 'left', 'right'], 'tail should be one of "both","left","right"'

    if data2 is not None:
        size1, size2 = len(data1), len(data2)
        delta = mean(data1) - mean(data2)
        var1 = variance(data1)
        var2 = variance(data2)
        if not equal:
            # Unequal variances: per-sample variance of the mean.
            part1 = var1 / size1
            part2 = var2 / size2
            t_val = (delta - mu) / sqrt(part1 + part2)
            df = (part1 + part2) ** 2 / (
                part1 ** 2 / (size1 - 1) + part2 ** 2 / (size2 - 1))
        else:
            # Equal variances: pooled standard deviation.
            df = size1 + size2 - 2
            pooled_sd = sqrt(((size1 - 1) * var1 + (size2 - 1) * var2) / df)
            t_val = (delta - mu) / (pooled_sd * sqrt(1 / size1 + 1 / size2))
    else:
        centre = mean(data1)
        std_err = std(data1) / sqrt(len(data1))
        t_val = (centre - mu) / std_err
        df = len(data1) - 1

    if tail == "left":
        return t_val, df, t.cdf(t_val, df)
    if tail == "both":
        return t_val, df, 2 * (1 - t.cdf(abs(t_val), df))
    return t_val, df, 1 - t.cdf(t_val, df)
def mean_one_sided_lower_ci_est(data, alpha, sigma=None):
    """Return a one-sided confidence interval ``(lower, inf)`` for the mean.

    Args:
        data: the sample.
        alpha: tail probability placed entirely on the lower side.
        sigma: known population standard deviation; when ``None`` the
            sample value from ``std(data)`` is used instead.

    Returns:
        ``(lower_bound, inf)`` -- the upper end is always ``inf``.
    """
    size = len(data)

    if sigma is not None:
        # Population spread known: standard normal critical value.
        critical = abs(norm.ppf(alpha))
        centre = mean(data)
        return centre - sigma / sqrt(size) * critical, inf

    # Population spread unknown: t distribution with size - 1 degrees of freedom.
    critical = abs(t.ppf(alpha, size - 1))
    spread = std(data)
    centre = mean(data)
    return centre - spread / sqrt(size) * critical, inf
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the mean.

    Args:
        data: the sample.
        alpha: total tail probability; ``alpha / 2`` is cut from each side.
        sigma: known population standard deviation, or ``None`` to fall
            back on ``std(data)`` and the t distribution.

    Returns:
        ``(lower, upper)`` bounds placed symmetrically around the sample mean.
    """
    size = len(data)
    centre = mean(data)

    if sigma is None:
        std_err = std(data) / sqrt(size)
        critical = abs(t.ppf(alpha / 2, size - 1))
    else:
        std_err = sigma / sqrt(size)
        # ppf returns the lower quantile (negative here), hence abs().
        critical = abs(norm.ppf(alpha / 2))

    return centre - std_err * critical, centre + std_err * critical
def mean_ci_est(data, alpha, sigma=None):
    """Estimate a two-sided confidence interval for the population mean.

    ``alpha / 2`` is taken from each tail.  When ``sigma`` (the population
    standard deviation) is supplied the normal distribution is used;
    otherwise ``std(data)`` and the t distribution with ``n - 1`` degrees of
    freedom are used.

    Returns:
        A ``(lower, upper)`` tuple.
    """
    count = len(data)
    midpoint = mean(data)

    # Known variance: normal quantile.
    if sigma is not None:
        step = sigma / sqrt(count)
        quantile = abs(norm.ppf(alpha / 2))
        return midpoint - step * quantile, midpoint + step * quantile

    # Unknown variance: t quantile on the sample standard deviation.
    sample_sd = std(data)
    step = sample_sd / sqrt(count)
    quantile = abs(t.ppf(alpha / 2, count - 1))
    return midpoint - step * quantile, midpoint + step * quantile
def mean_ci_est(data, alpha, sigma=None):
    """Interval estimate of the mean, with both bounds rounded to 2 decimals.

    Args:
        data: the sample.
        alpha: total tail probability; ``alpha / 2`` is cut from each side.
        sigma: known population standard deviation, or ``None`` to use
            ``std(data)`` with the t distribution.

    Returns:
        ``(lower, upper)``, each passed through ``round(..., 2)``.
    """
    size = len(data)
    centre = mean(data)

    if sigma is not None:
        # Variance known: normal critical value.
        std_err = sigma / np.sqrt(size)
        critical = abs(norm.ppf(alpha / 2))
    else:
        # Variance unknown: t critical value with size - 1 degrees of freedom.
        sample_sd = std(data)
        std_err = sample_sd / np.sqrt(size)
        critical = abs(t.ppf(alpha / 2, size - 1))

    lower = centre - std_err * critical
    upper = centre + std_err * critical
    return round(lower, 2), round(upper, 2)
def t_test(data1, data2=None, tail='both', mu=0.0, equal=True):
    """Run a one-sample or two-sample t-test.

    With only ``data1`` given, the sample mean is tested against ``mu``.
    With ``data2`` given as well, ``mean(data1) - mean(data2)`` is tested
    against ``mu``; ``equal`` chooses between the pooled-variance test and
    the unequal-variance test with Welch-Satterthwaite degrees of freedom.

    Args:
        data1: first sample.
        data2: optional second sample.
        tail: 'both' (two-sided), 'left' or 'right'.
        mu: hypothesised mean (one sample) or mean difference (two samples).
        equal: treat the two population variances as equal.

    Returns:
        ``(t_val, df, p)`` where ``t_val`` and ``df`` are rounded to two
        decimals and ``p`` is left unrounded.

    Raises:
        AssertionError: if ``tail`` is not 'both', 'left' or 'right'.

    Note: ``mean``, ``std`` and ``variance`` are the enclosing module's
    helpers (presumably sample, n-1, estimators -- confirm against the file).
    """
    assert tail in ['both', 'left', 'right'], 'tail should be one of "both", "left", "right"'

    if data2 is None:
        # Single population.
        mean_val = mean(data1)
        se = std(data1) / np.sqrt(len(data1))
        t_val = (mean_val - mu) / se
        df = len(data1) - 1
    else:
        # Two populations.
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)

        if equal:
            # Equal variances: pooled standard deviation.
            df = n1 + n2 - 2
            sw = np.sqrt(((n1 - 1) * sample1_var + (n2 - 1) * sample2_var) / df)
            # The standard error needs 1/n1 + 1/n2; using 1/n2 twice gives a
            # wrong statistic whenever the two sample sizes differ.
            t_val = (mean_diff - mu) / (sw * np.sqrt(1 / n1 + 1 / n2))
        else:
            # Unequal variances: unpooled standard error.
            var_of_mean1 = sample1_var / n1
            var_of_mean2 = sample2_var / n2
            se = np.sqrt(var_of_mean1 + var_of_mean2)
            t_val = (mean_diff - mu) / se
            df = (var_of_mean1 + var_of_mean2) ** 2 / (
                var_of_mean1 ** 2 / (n1 - 1) + var_of_mean2 ** 2 / (n2 - 1))

    if tail == 'both':
        # Two-tailed: double the area beyond |t|.
        p = 2 * (1 - t.cdf(abs(t_val), df))
    elif tail == 'left':
        # Left-tailed: area below t.
        p = t.cdf(t_val, df)
    else:
        # Right-tailed: area above t.
        p = 1 - t.cdf(t_val, df)

    return round(t_val, 2), round(df, 2), p
def mean_ci_est(data, alpha, sigma=None):
    """Two-sided confidence interval for the population mean.

    Handles both the case where the population standard deviation is
    unknown (t distribution) and where it is known (normal distribution).

    Args:
        data: the sample.
        alpha: significance level, i.e. ``1 - confidence``; ``alpha / 2`` is
            cut from each tail.
        sigma: known population standard deviation, or ``None`` when it is
            unknown and must be estimated with ``std(data)``.

    Returns:
        ``(lower, upper)`` bounds placed symmetrically around the sample mean.
    """
    n = len(data)  # sample size
    sample_mean = mean(data)  # sample mean

    if sigma is None:
        # Variance unknown: estimate the spread from the sample.
        s = std(data)  # sample standard deviation
        se = s / sqrt(n)  # standard error
        t_value = abs(t.ppf(alpha / 2, n - 1))  # t critical value
        return sample_mean - se * t_value, sample_mean + se * t_value

    # Variance known.  sqrt must be called with n: dividing sigma by the
    # bare function object raises TypeError.
    se = sigma / sqrt(n)  # standard error
    # ppf(alpha / 2) is the lower-tail quantile (negative), so take abs().
    z_value = abs(norm.ppf(alpha / 2))
    return sample_mean - se * z_value, sample_mean + se * z_value
# Exercise each descriptive statistic on the sample, one result per line, in
# this order: frequency table, mode, median, mean, range, quartiles,
# variance, standard deviation.
for statistic in (frequency, mode, median, mean, rng, quartile, variance, std):
    print(statistic(data))
总体均值未知,求方差的置信空间 data为传入的样本; alpha为需要传入的置信水平的值 """ n = len(data) #求样本容量 s2 = variance(data) #求样本方差 chi2_lower_value = chi2.ppf( alpha / 2, n - 1) #求坐标左侧Z面积,没错你没看错,因为数学证明的过程中是以右侧为基准的,但是scipy是以左侧为基准的 chi2_upper_value = chi2.ppf(1 - alpha / 2, n - 1) #求坐标右侧Z面积 return (n - 1) * s2 / chi2_upper_value, (n - 1) * s2 / chi2_lower_value if __name__ == '__main__': salary_18 = [1484, 785, 1598, 1366, 1716, 1020, 1716, 785, 3113, 1601] #18岁月收入数据 salary_35 = [902, 4508, 3809, 3923, 4276, 2065, 1601, 553, 3345, 2182] #35岁月收入数据 print(mean(salary_18)) #平均月收入的点估计 print(mean_ci_est(salary_18, 0.05)) #平均月收入的区间估计 print(mean(salary_35)) #平均月收入的点估计 print(mean_ci_est(salary_35, 0.05)) #平均月收入的区间估计 print() print(std(salary_18)) #整体方差的点估计开根 print(variance(salary_18)) #整体方差的点估计(样本方差) print(var_ci_est(salary_18, 0.05)) #区间估计 print(std(salary_35)) #整体方差的点估计开根 print(variance(salary_35)) #整体方差的点估计(样本方差) print(var_ci_est(salary_35, 0.05)) #区间估计 print()