def f_test(data1, data2, tail="both", ratio=1):
    """F test for the ratio of two population variances.

    :param data1: first sample
    :param data2: second sample
    :param tail: tail type, one of "both", "left", "right"
    :param ratio: hypothesized variance ratio under H0 (default 1)
    :return: (F statistic, numerator df, denominator df, p-value)
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of “both”, “left”, “right”'
    df_num = len(data1) - 1
    df_den = len(data2) - 1
    # F statistic: ratio of sample variances, scaled by the H0 ratio
    statistic = variance(data1) / variance(data2) / ratio
    cdf_val = f.cdf(statistic, df_num, df_den)
    if tail == "left":
        p = cdf_val
    elif tail == "right":
        p = 1 - cdf_val
    else:
        # two-sided: double the smaller tail probability
        p = 2 * min(cdf_val, 1 - cdf_val)
    return statistic, df_num, df_den, p
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """Confidence interval for the difference of two population means,
    variances unknown, based on the t distribution.

    :param data1: first sample
    :param data2: second sample
    :param alpha: significance level (confidence level is 1 - alpha)
    :param equal: True if the two population variances are assumed equal
    :return: (lower bound, upper bound)
    """
    n1, n2 = len(data1), len(data2)
    diff = mean(data1) - mean(data2)
    var1, var2 = variance(data1), variance(data2)

    if equal:
        # pooled standard deviation for the equal-variance case
        pooled = sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
        quantile = abs(t.ppf(alpha / 2, n1 + n2 - 2))
        margin = pooled * sqrt(1 / n1 + 1 / n2) * quantile
    else:
        # Welch–Satterthwaite approximation for the degrees of freedom
        se2 = var1 / n1 + var2 / n2
        df = se2 ** 2 / ((var1 / n1) ** 2 / (n1 - 1)
                         + (var2 / n2) ** 2 / (n2 - 1))
        margin = sqrt(se2) * abs(t.ppf(alpha / 2, df))
    return diff - margin, diff + margin
def var_ratio_ci_est(data1, data2, alpha):
    """Confidence interval for the ratio of two population variances,
    based on the F distribution.

    :param data1: first sample
    :param data2: second sample
    :param alpha: significance level (confidence level is 1 - alpha)
    :return: (lower bound, upper bound) for sigma1^2 / sigma2^2
    """
    df1, df2 = len(data1) - 1, len(data2) - 1
    # point estimate: ratio of the sample variances
    est = variance(data1) / variance(data2)
    lower_quantile = f.ppf(alpha / 2, df1, df2)
    upper_quantile = f.ppf(1 - alpha / 2, df1, df2)
    # dividing by the upper quantile gives the lower bound and vice versa
    return est / upper_quantile, est / lower_quantile
def t_test(data1, data2=None, tail="both", mu=0, equal=True):
    """One- or two-sample t test.

    :param data1: first sample
    :param data2: optional second sample; None means a one-sample test
    :param tail: tail type, one of "both", "left", "right"
    :param mu: hypothesized mean (or mean difference) under H0
    :param equal: for two samples, assume equal population variances
    :return: (t statistic, degrees of freedom, p-value)
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'
    if data2 is None:
        # one-sample test against mu
        df = len(data1) - 1
        t_val = (mean(data1) - mu) / (std(data1) / sqrt(len(data1)))
    else:
        n1, n2 = len(data1), len(data2)
        diff = mean(data1) - mean(data2)
        var1, var2 = variance(data1), variance(data2)
        if equal:
            # pooled-variance two-sample test
            pooled = sqrt(((n1 - 1) * var1 + (n2 - 1) * var2)
                          / (n1 + n2 - 2))
            t_val = (diff - mu) / (pooled * sqrt(1 / n1 + 1 / n2))
            df = n1 + n2 - 2
        else:
            # Welch's test with Satterthwaite degrees of freedom
            se2 = var1 / n1 + var2 / n2
            t_val = (diff - mu) / sqrt(se2)
            df = se2 ** 2 / ((var1 / n1) ** 2 / (n1 - 1)
                             + (var2 / n2) ** 2 / (n2 - 1))
    if tail == "both":
        p = 2 * (1 - t.cdf(abs(t_val), df))
    elif tail == "left":
        p = t.cdf(t_val, df)
    else:
        p = 1 - t.cdf(t_val, df)
    return t_val, df, p
def var_ci_est(data, alpha):
    """Confidence interval for the population variance, chi-square based.

    :param data: sample values
    :param alpha: significance level (confidence level is 1 - alpha)
    :return: (lower bound, upper bound) for sigma^2
    """
    df = len(data) - 1
    # (n - 1) * s^2 is chi-square distributed with n - 1 df under normality
    scaled = df * variance(data)
    return (scaled / chi2.ppf(1 - alpha / 2, df),
            scaled / chi2.ppf(alpha / 2, df))
def simple_linear_reg(y, x): """一元线性回归""" assert len(x) == len(y) n = len(x) assert n > 1 mean_x = mean(x) mean_y = mean(y) beta1 = covariance(x, y) / variance(x) beta0 = mean_y - beta1 * mean_x y_hat = [beta0 + beta1 * e for e in x] ss_residual = sum((e1 - e2)**2 for e1, e2 in zip(y, y_hat)) se_model = sqrt(ss_residual / (n - 2)) t_value = beta1 / (se_model / sqrt((n - 1) * variance(x))) p = 2 * (1 - t.cdf(abs(t_value), n - 2)) return beta0, beta1, t_value, n - 2, p
def chi2_test(data, tail="both", sigma2=1):
    """Chi-square test for a population variance.

    :param data: sample values
    :param tail: tail type, one of "both", "left", "right"
    :param sigma2: hypothesized population variance sigma^2 under H0
    :return: (chi-square statistic, degrees of freedom, p-value)
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of “both”, “left”, “right”'
    df = len(data) - 1
    # (n - 1) * s^2 / sigma^2 is chi-square distributed under H0
    statistic = df * variance(data) / sigma2
    cdf_val = chi2.cdf(statistic, df)
    if tail == "left":
        p = cdf_val
    elif tail == "right":
        p = 1 - cdf_val
    else:
        # two-sided: double the smaller tail probability
        p = 2 * min(cdf_val, 1 - cdf_val)
    return statistic, df, p
# NOTE(review): this is the continuation of a demo script; median, mean,
# rng, quartile, variance, std, covariance, cor and the variables
# data_zws2, data_zws3, zws1 are defined earlier in the file.
zws2 = median(data_zws2)
zws3 = median(data_zws3)
print("zws", zws1, zws2, zws3)
# Confirms again that the median is unaffected by extreme values:
# it is a measure of central tendency.
# Try the mean
data_jz = [1, 2, 3, 4, 5]
jz = mean(data_jz)
print("jz", jz)
# Try the range (note the extreme value 999)
data_jc = [1, 2, 3, 999]
jc = rng(data_jc)
print("jc", jc)
# Try the quartiles
data_sfws = [1, 4, 2, 3, 5]
sfws = quartile(data_sfws)
print(sfws)
# Try the variance and standard deviation
data_fc = [1, 4, 2, 3, 5]
fc = variance(data_fc)
bzc = std(data_fc)
print(fc, bzc)
# Try covariance and the correlation coefficient
score = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
happy = [1, 3, 2, 6, 4, 5, 8, 10, 9, 7]
print(covariance(score, happy))
print(cor(score, happy))
import random

import matplotlib.pyplot as plt

from stats.descriptive_stats import mean, variance

if __name__ == '__main__':
    # Demonstrates consistency of the sample mean and sample variance
    # estimators: as the sample size grows, each estimate settles toward
    # the true parameter of the sampled distribution (0 and 1 here).
    """相合性"""
    # sample mean for each sample size
    sample_means = []
    # sample variance for each sample size
    sample_vars = []
    # sample sizes tried
    indices = []
    for sz in range(20, 10001, 50):
        indices.append(sz)
        # draw sz values from a standard normal distribution N(0, 1)
        sample = [random.gauss(0.0, 1.0) for _ in range(sz)]
        sample_means.append(mean(sample))
        sample_vars.append(variance(sample))
    # plot both estimate sequences against the sample size
    plt.plot(indices, sample_means)
    plt.plot(indices, sample_vars)
    """结论,当样本越大时,样本均值逐渐趋向于样本方差,这就是相合性。"""
    plt.show()