def z_test(data1, data2=None, tail="both", mu=0, sigma1=1, sigma2=None):
    """
    Run a one-sample or two-sample z-test.

    One sample (``data2`` is None):
        ``z = (mean(data1) - mu) / (sigma1 / sqrt(len(data1)))``
    Two samples:
        ``z = (mean(data1) - mean(data2) - mu)
              / sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))``

    :param data1: sample 1
    :param data2: sample 2; ``None`` selects the one-sample test
    :param tail: ``"both"`` (two-sided, default), ``"left"`` or ``"right"``
    :param mu: hypothesised mean (one sample) or hypothesised difference
        of the two means (two samples)
    :param sigma1: standard deviation used in the standard error of sample 1
    :param sigma2: standard deviation used in the standard error of sample 2;
        must not be ``None`` when ``data2`` is given
    :return: tuple ``(z_value, p)``
    :raises AssertionError: if ``tail`` is not one of the accepted values, or
        if ``data2`` is given without ``sigma2``
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'

    if data2 is None:
        # Sample mean and its standard error.
        mean_value = mean(data1)
        se = sigma1 / sqrt(len(data1))
        z_value = (mean_value - mu) / se
    else:
        assert sigma2 is not None
        mean_diff = mean(data1) - mean(data2)
        se = sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))
        z_value = (mean_diff - mu) / se

    # Use the survival function for upper-tail areas: ``1 - cdf(z)`` rounds to
    # exactly 0 once the tail drops below double precision, ``sf(z)`` does not.
    if tail == "both":
        p = 2 * norm.sf(abs(z_value))
    elif tail == "left":
        p = norm.cdf(z_value)
    else:
        p = norm.sf(z_value)

    return z_value, p
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """
    Confidence interval for the difference of two means using the t distribution.

    :param data1: sample 1
    :param data2: sample 2
    :param alpha: significance level; the critical value is taken at ``alpha / 2``
    :param equal: when True the two sample variances are pooled
        (``n1 + n2 - 2`` degrees of freedom); when False each sample keeps its
        own variance and the degrees of freedom come from the
        Welch-Satterthwaite formula
    :return: tuple ``(lower, upper)``
    """
    size1, size2 = len(data1), len(data2)
    center = mean(data1) - mean(data2)
    # ``variance`` is the imported helper (presumably the n-1 sample variance).
    var1, var2 = variance(data1), variance(data2)

    if not equal:
        # Per-sample contributions to the variance of the mean difference.
        term1 = var1 / size1
        term2 = var2 / size2
        dof = (term1 + term2)**2 / (
            term1**2 / (size1 - 1) + term2**2 / (size2 - 1))
        critical = abs(t.ppf(alpha / 2, dof))
        half_width = sqrt(term1 + term2) * critical
        return center - half_width, center + half_width

    # Pooled standard deviation of the two samples.
    pooled_sd = sqrt(
        ((size1 - 1) * var1 + (size2 - 1) * var2) / (size1 + size2 - 2))
    critical = abs(t.ppf(alpha / 2, size1 + size2 - 2))
    half_width = pooled_sd * sqrt(1 / size1 + 1 / size2) * critical
    return center - half_width, center + half_width
def mean_diff_ci_z_est(data1, data2, alpha, sigma1, sigma2):
    """
    Confidence interval for the difference of two means using the normal
    distribution and caller-supplied standard deviations.

    :param data1: sample 1
    :param data2: sample 2
    :param alpha: significance level; the critical value is taken at ``alpha / 2``
    :param sigma1: standard deviation used for sample 1
    :param sigma2: standard deviation used for sample 2
    :return: tuple ``(lower, upper)``
    """
    size1, size2 = len(data1), len(data2)
    center = mean(data1) - mean(data2)
    critical = abs(norm.ppf(alpha / 2))
    # Standard error of the difference of the two sample means.
    std_err = sqrt(sigma1 ** 2 / size1 + sigma2 ** 2 / size2)
    half_width = std_err * critical
    return center - half_width, center + half_width
def anova_oneway(data):
    """
    One-way analysis of variance.

    :param data: sequence of groups, each group a sequence of observations
    :return: tuple ``(f_value, dfg, dfe, p)`` where ``dfg = k - 1`` and
        ``dfe = n - k`` for ``k`` groups and ``n`` observations in total
    :raises AssertionError: if there are fewer than two groups, or the total
        number of observations does not exceed the number of groups
    """
    k = len(data)
    assert k > 1

    # Per-group means and sizes.
    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)
    assert n > k

    # Grand mean as the size-weighted average of the group means.
    grand_mean = sum(
        group_mean * group_sz
        for group_mean, group_sz in zip(group_means, group_szs)) / n

    # Sums of squares: total, between groups, and (by subtraction) within groups.
    sst = sum(sum((y - grand_mean)**2 for y in group) for group in data)
    ssg = sum((group_mean - grand_mean)**2 * group_sz
              for group_mean, group_sz in zip(group_means, group_szs))
    sse = sst - ssg

    dfg = k - 1
    dfe = n - k

    # Mean squares and the F statistic.
    msg = ssg / dfg
    mse = sse / dfe
    f_value = msg / mse

    # Survival function instead of ``1 - cdf``: the latter rounds to exactly 0
    # for very large F values, hiding the magnitude of the p-value.
    p = f.sf(f_value, dfg, dfe)

    return f_value, dfg, dfe, p
def sample(num_of_samples, sample_sz):
    """
    Simulate sample means drawn from the uniform distribution on [0, 1].

    :param num_of_samples: number of samples to draw
    :param sample_sz: number of observations in each sample
    :return: list with the mean of each simulated sample
    """
    # One entry per sample: draw ``sample_sz`` uniform values and average them.
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]
def variance_bias(data):
    """
    Biased variance estimate: the mean squared deviation from the sample mean,
    dividing by ``n`` rather than ``n - 1``.

    :param data: sequence of observations
    :return: the biased variance, or ``None`` when ``data`` has fewer than two
        elements
    """
    size = len(data)
    if size > 1:
        center = mean(data)
        squared_deviations = sum((value - center)**2 for value in data)
        return squared_deviations / size
    return None
def t_test(data1, data2=None, tail="both", mu=0, equal=True):
    """
    Run a one-sample or two-sample t-test.

    :param data1: sample 1
    :param data2: sample 2; ``None`` selects the one-sample test
    :param tail: ``"both"`` (two-sided, default), ``"left"`` or ``"right"``
    :param mu: hypothesised mean (one sample) or hypothesised difference
        of the two means (two samples)
    :param equal: two-sample only. When True the sample variances are pooled
        and ``df = n1 + n2 - 2``; when False each sample keeps its own
        variance and ``df`` comes from the Welch-Satterthwaite formula
    :return: tuple ``(t_val, df, p)``
    :raises AssertionError: if ``tail`` is not one of the accepted values
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'

    if data2 is None:
        mean_val = mean(data1)
        se = std(data1) / sqrt(len(data1))
        t_val = (mean_val - mu) / se
        df = len(data1) - 1
    else:
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)
        if equal:
            # Pooled standard deviation.
            sw = sqrt(((n1 - 1) * sample1_var + (n2 - 1) * sample2_var) /
                      (n1 + n2 - 2))
            t_val = (mean_diff - mu) / (sw * sqrt(1 / n1 + 1 / n2))
            df = n1 + n2 - 2
        else:
            se = sqrt(sample1_var / n1 + sample2_var / n2)
            t_val = (mean_diff - mu) / se
            df_numerator = (sample1_var / n1 + sample2_var / n2)**2
            df_denominator = (sample1_var / n1)**2 / (n1 - 1) + (
                sample2_var / n2)**2 / (n2 - 1)
            df = df_numerator / df_denominator

    # Use the survival function for upper-tail areas: ``1 - cdf(t)`` rounds to
    # exactly 0 once the tail drops below double precision, ``sf(t)`` does not.
    if tail == "both":
        p = 2 * t.sf(abs(t_val), df)
    elif tail == "left":
        p = t.cdf(t_val, df)
    else:
        p = t.sf(t_val, df)

    return t_val, df, p
def simple_linear_reg(y, x): """一元线性回归""" assert len(x) == len(y) n = len(x) assert n > 1 mean_x = mean(x) mean_y = mean(y) beta1 = covariance(x, y) / variance(x) beta0 = mean_y - beta1 * mean_x y_hat = [beta0 + beta1 * e for e in x] ss_residual = sum((e1 - e2)**2 for e1, e2 in zip(y, y_hat)) se_model = sqrt(ss_residual / (n - 2)) t_value = beta1 / (se_model / sqrt((n - 1) * variance(x))) p = 2 * (1 - t.cdf(abs(t_value), n - 2)) return beta0, beta1, t_value, n - 2, p
def mean_ci_est(data, alpha, sigma=None):
    """
    Confidence interval for the mean of ``data``.

    :param data: sample
    :param alpha: significance level; the critical value is taken at ``alpha / 2``
    :param sigma: known standard deviation. When given, the interval uses the
        normal distribution; when ``None``, it uses ``std(data)`` and the
        t distribution with ``n - 1`` degrees of freedom
    :return: tuple ``(lower, upper)``
    """
    size = len(data)
    center = mean(data)

    if sigma is not None:
        # Known standard deviation: z-based interval.
        std_err = sigma / sqrt(size)
        critical = abs(norm.ppf(alpha / 2))
    else:
        # Unknown standard deviation: estimate it and use the t distribution.
        std_err = std(data) / sqrt(size)
        critical = abs(t.ppf(alpha / 2, size - 1))

    half_width = std_err * critical
    return center - half_width, center + half_width
return sum((e - mean_value)**2 for e in data) / n def sample(num_of_samples, sample_sz, var): data = [] # 遍历样本 for _ in range(num_of_samples): # 从 0-1的均匀分布中抽取 sample_sz 的个体组成的样本,mean 计算样本均值 data.append(var([random.uniform(0.0, 1.0) for _ in range(sample_sz)])) return data if __name__ == '__main__': """有偏""" data1 = sample(1000, 40, variance_bias) plt.hist(data1, bins="auto", rwidth=0.8) # 样本方差均值 实验值 plt.axvline(x=mean(data1), c="000") # 总体方差均值 (b-a)^2/12 0.0, 1.0 理论值 plt.axvline(x=1 / 12, c="red") print("bias: ", mean(data1), 1 / 12) plt.show() """无偏""" data2 = sample(1000, 40, variance) plt.hist(data1, bins="auto", rwidth=0.8) # 样本方差均值 实验值 plt.axvline(x=mean(data2), c="000") # 总体方差均值 (b-a)^2/12 0.0, 1.0 理论值 plt.axvline(x=1 / 12, c="red") print("un_bias: ", mean(data2), 1 / 12) plt.show()
# 测试众数 zs, zs_count = mode(data) print("zs", zs, zs_count) # 测试中位数 data_zws1 = [1, 2, 3, 4] data_zws2 = [1, 2, 3, 4, 5] data_zws3 = [1, 2, 3, 4, 99] zws1 = median(data_zws1) zws2 = median(data_zws2) zws3 = median(data_zws3) print("zws", zws1, zws2, zws3) # 再次认证中位数和极端值没有关联,是集中趋势 # 测试均值 data_jz = [1, 2, 3, 4, 5] jz = mean(data_jz) print("jz", jz) # 测试极差 data_jc = [1, 2, 3, 999] jc = rng(data_jc) print("jc", jc) # 测试四分位数 data_sfws = [1, 4, 2, 3, 5] sfws = quartile(data_sfws) print(sfws) # 测试方差 data_fc = [1, 4, 2, 3, 5] fc = variance(data_fc)
import random

import matplotlib.pyplot as plt

from stats.descriptive_stats import mean, variance

if __name__ == '__main__':
    # Consistency demo: draw standard-normal samples of increasing size and
    # plot the sample mean and sample variance against the sample size.
    sample_means = []
    sample_vars = []
    # Sample sizes used on the x axis.
    indices = []
    for sz in range(20, 10001, 50):
        indices.append(sz)
        # Sample of ``sz`` draws from the Gaussian with mean 0 and sigma 1.
        sample = [random.gauss(0.0, 1.0) for _ in range(sz)]
        sample_means.append(mean(sample))
        sample_vars.append(variance(sample))

    plt.plot(indices, sample_means, label="sample mean")
    plt.plot(indices, sample_vars, label="sample variance")
    plt.legend()
    # Conclusion: as the sample size grows, the sample mean settles near the
    # distribution mean (0.0) and the sample variance settles near the
    # distribution variance (1.0). Each estimator converges to its own
    # parameter, which is what consistency means.
    plt.show()
import random
import matplotlib.pyplot as plt
from stats.descriptive_stats import mean


def sample(num_of_samples, sample_sz):
    """
    Simulate sample means drawn from the uniform distribution on [0, 1].

    :param num_of_samples: number of samples to draw
    :param sample_sz: number of observations in each sample
    :return: list with the mean of each simulated sample
    """
    # One entry per sample: draw ``sample_sz`` uniform values and average them.
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


"""中心极限定理"""
# Central limit theorem demo: histogram of the simulated sample means.
if __name__ == '__main__':
    data = sample(1000, 40)
    plt.hist(data, bins="auto", rwidth=0.8)
    # Mark the mean of the simulated sample means.
    plt.axvline(x=mean(data), c="red")
    plt.show()