def z_test(data1, data2=None, tail="both", mu=0, sigma1=1, sigma2=None):
    """
    Z-test for one mean (one sample) or a difference of means (two samples).

    :param data1: sample 1
    :param data2: sample 2; when None a one-sample test is performed
    :param tail: which tail(s) the p-value covers: "both" (two-sided,
        default), "left" or "right"
    :param mu: hypothesised mean (one sample) or hypothesised difference
        of the two means (two samples)
    :param sigma1: standard deviation used for sample 1 in the standard error
    :param sigma2: standard deviation used for sample 2; required when
        data2 is given
    :return: tuple (z_value, p)
    """

    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'

    if data2 is None:
        # One sample: compare the sample mean against mu.
        mean_value = mean(data1)
        # Standard error of the mean.
        se = sigma1 / sqrt(len(data1))
        z_value = (mean_value - mu) / se
    else:
        assert sigma2 is not None
        # Two samples: compare the difference of the sample means against mu.
        mean_diff = mean(data1) - mean(data2)
        se = sqrt(sigma1**2 / len(data1) + sigma2**2 / len(data2))
        z_value = (mean_diff - mu) / se

    # Upper-tail areas use the survival function: ``1 - cdf(x)`` rounds to
    # exactly 0.0 once cdf(x) is within float precision of 1.
    if tail == "both":
        p = 2 * norm.sf(abs(z_value))
    elif tail == "left":
        p = norm.cdf(z_value)
    else:
        p = norm.sf(z_value)
    return z_value, p
Esempio n. 2
0
def mean_diff_ci_t_est(data1, data2, alpha, equal=True):
    """
    t-based confidence interval for the difference of two means
    (mean of data1 minus mean of data2), using the sample variances.

    :param data1: sample 1
    :param data2: sample 2
    :param alpha: significance level; the critical value is the alpha / 2
        quantile of the t distribution
    :param equal: True pools the two sample variances (equal-variance case);
        False uses separate variances with Welch-Satterthwaite degrees
        of freedom
    :return: tuple (lower, upper)
    """
    size_a, size_b = len(data1), len(data2)
    diff = mean(data1) - mean(data2)
    var_a = variance(data1)
    var_b = variance(data2)

    if not equal:
        # Unequal variances: each sample contributes its own variance term.
        term_a = var_a / size_a
        term_b = var_b / size_b
        # Welch-Satterthwaite approximation of the degrees of freedom.
        dof = (term_a + term_b)**2 / (
            term_a**2 / (size_a - 1) + term_b**2 / (size_b - 1))
        critical = abs(t.ppf(alpha / 2, dof))
        half_width = sqrt(term_a + term_b) * critical
        return diff - half_width, diff + half_width

    # Equal variances: pooled standard deviation.
    dof = size_a + size_b - 2
    pooled_sd = sqrt(((size_a - 1) * var_a + (size_b - 1) * var_b) / dof)
    critical = abs(t.ppf(alpha / 2, dof))
    half_width = pooled_sd * sqrt(1 / size_a + 1 / size_b) * critical
    return diff - half_width, diff + half_width
Esempio n. 3
0
def mean_diff_ci_z_est(data1, data2, alpha, sigma1, sigma2):
    """
    z-based confidence interval for the difference of two means
    (mean of data1 minus mean of data2) with given standard deviations.

    :param data1: sample 1
    :param data2: sample 2
    :param alpha: significance level; the critical value is the alpha / 2
        quantile of the standard normal distribution
    :param sigma1: standard deviation used for sample 1
    :param sigma2: standard deviation used for sample 2
    :return: tuple (lower, upper)
    """
    size_a = len(data1)
    size_b = len(data2)
    center = mean(data1) - mean(data2)
    critical = abs(norm.ppf(alpha / 2))
    # Half-width = standard error of the difference times the critical value.
    half_width = sqrt(sigma1 ** 2 / size_a + sigma2 ** 2 / size_b) * critical
    return center - half_width, center + half_width
def anova_oneway(data):
    """
    One-way analysis of variance.

    :param data: sequence of groups, each group a sequence of observations
    :return: tuple (f_value, dfg, dfe, p) - the F statistic, the
        between-group and within-group degrees of freedom, and the
        upper-tail p-value
    """
    k = len(data)
    assert k > 1

    # Per-group means and sizes.
    group_means = [mean(group) for group in data]
    group_szs = [len(group) for group in data]
    n = sum(group_szs)
    assert n > k

    # Grand mean, weighted by group size.
    grand_mean = sum(
        group_mean * group_sz
        for group_mean, group_sz in zip(group_means, group_szs)) / n

    # Sums of squares: total, between groups, and within groups (error).
    sst = sum(sum((y - grand_mean)**2 for y in group) for group in data)
    ssg = sum((group_mean - grand_mean)**2 * group_sz
              for group_mean, group_sz in zip(group_means, group_szs))
    sse = sst - ssg

    dfg = k - 1
    dfe = n - k

    # Mean squares.
    msg = ssg / dfg
    mse = sse / dfe

    f_value = msg / mse
    # Survival function instead of ``1 - cdf``: the subtraction rounds to
    # exactly 0.0 for large F statistics.
    p = f.sf(f_value, dfg, dfe)

    return f_value, dfg, dfe, p
Esempio n. 5
0
def sample(num_of_samples, sample_sz):
    """
    Draw ``num_of_samples`` samples of ``sample_sz`` values each from the
    uniform distribution on [0, 1] and return the list of sample means.
    """
    return [
        mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]
Esempio n. 6
0
def variance_bias(data):
    """
    Biased variance: the sum of squared deviations from the mean divided
    by n (not n - 1).

    Returns None when ``data`` has fewer than two elements.
    """
    count = len(data)
    if count < 2:
        return None

    center = mean(data)
    squared_deviations = [(value - center)**2 for value in data]
    return sum(squared_deviations) / count
def t_test(data1, data2=None, tail="both", mu=0, equal=True):
    """
    t-test for one mean (one sample) or a difference of means (two samples).

    :param data1: sample 1
    :param data2: sample 2; when None a one-sample test is performed
    :param tail: which tail(s) the p-value covers: "both" (two-sided,
        default), "left" or "right"
    :param mu: hypothesised mean (one sample) or hypothesised difference
        of the two means (two samples)
    :param equal: two-sample case only. True pools the sample variances;
        False uses separate variances with Welch-Satterthwaite degrees
        of freedom
    :return: tuple (t_val, df, p)
    """
    assert tail in ["both", "left", "right"], \
        'tail should be one of "both", "left", "right"'

    if data2 is None:
        mean_val = mean(data1)
        se = std(data1) / sqrt(len(data1))
        t_val = (mean_val - mu) / se
        df = len(data1) - 1
    else:
        n1 = len(data1)
        n2 = len(data2)
        mean_diff = mean(data1) - mean(data2)
        sample1_var = variance(data1)
        sample2_var = variance(data2)

        if equal:
            # Pooled standard deviation.
            sw = sqrt(((n1 - 1) * sample1_var + (n2 - 1) * sample2_var) /
                      (n1 + n2 - 2))
            t_val = (mean_diff - mu) / (sw * sqrt(1 / n1 + 1 / n2))
            df = n1 + n2 - 2
        else:
            se = sqrt(sample1_var / n1 + sample2_var / n2)
            t_val = (mean_diff - mu) / se
            # Welch-Satterthwaite approximation of the degrees of freedom.
            df_numerator = (sample1_var / n1 + sample2_var / n2)**2
            df_denominator = (sample1_var / n1)**2 / (n1 - 1) + (
                sample2_var / n2)**2 / (n2 - 1)
            df = df_numerator / df_denominator

    # Upper-tail areas use the survival function: ``1 - cdf(x)`` rounds to
    # exactly 0.0 once cdf(x) is within float precision of 1.
    if tail == "both":
        p = 2 * t.sf(abs(t_val), df)
    elif tail == "left":
        p = t.cdf(t_val, df)
    else:
        p = t.sf(t_val, df)

    return t_val, df, p
def simple_linear_reg(y, x):
    """一元线性回归"""
    assert len(x) == len(y)

    n = len(x)
    assert n > 1

    mean_x = mean(x)
    mean_y = mean(y)

    beta1 = covariance(x, y) / variance(x)
    beta0 = mean_y - beta1 * mean_x

    y_hat = [beta0 + beta1 * e for e in x]
    ss_residual = sum((e1 - e2)**2 for e1, e2 in zip(y, y_hat))
    se_model = sqrt(ss_residual / (n - 2))

    t_value = beta1 / (se_model / sqrt((n - 1) * variance(x)))
    p = 2 * (1 - t.cdf(abs(t_value), n - 2))

    return beta0, beta1, t_value, n - 2, p
Esempio n. 9
0
def mean_ci_est(data, alpha, sigma=None):
    """
    Confidence interval for the mean of ``data``.

    :param data: the sample
    :param alpha: significance level; the critical value is the alpha / 2
        quantile of the reference distribution
    :param sigma: standard deviation to use; when None the sample standard
        deviation and the t distribution (n - 1 degrees of freedom) are
        used, otherwise the standard normal distribution
    :return: tuple (lower, upper)
    """
    size = len(data)
    center = mean(data)

    if sigma is not None:
        # Standard deviation given: z-based interval.
        std_error = sigma / sqrt(size)
        critical = abs(norm.ppf(alpha / 2))
        return center - std_error * critical, center + std_error * critical

    # Standard deviation not given: estimate it and use a t-based interval.
    std_error = std(data) / sqrt(size)
    critical = abs(t.ppf(alpha / 2, size - 1))
    return center - std_error * critical, center + std_error * critical
Esempio n. 10
0
    return sum((e - mean_value)**2 for e in data) / n


def sample(num_of_samples, sample_sz, var):
    """
    Draw ``num_of_samples`` samples of ``sample_sz`` values each from the
    uniform distribution on [0, 1], apply ``var`` to every sample and
    return the list of results.
    """
    return [
        var([random.uniform(0.0, 1.0) for _ in range(sample_sz)])
        for _ in range(num_of_samples)
    ]


if __name__ == '__main__':
    # Biased estimator: distribution of variance_bias over 1000 samples of
    # size 40.
    data1 = sample(1000, 40, variance_bias)
    plt.hist(data1, bins="auto", rwidth=0.8)
    # Experimental value: mean of the sample variances.
    plt.axvline(x=mean(data1), c="000")
    # Theoretical value: the variance of Uniform(0, 1) is (b - a)^2 / 12.
    plt.axvline(x=1 / 12, c="red")
    print("bias: ", mean(data1), 1 / 12)
    plt.show()
    # Unbiased estimator: same experiment with variance.
    data2 = sample(1000, 40, variance)
    # Plot data2 here; the histogram previously re-plotted data1, so the
    # second figure did not match its own mean line.
    plt.hist(data2, bins="auto", rwidth=0.8)
    # Experimental value: mean of the sample variances.
    plt.axvline(x=mean(data2), c="000")
    # Theoretical value: the variance of Uniform(0, 1) is (b - a)^2 / 12.
    plt.axvline(x=1 / 12, c="red")
    print("un_bias: ", mean(data2), 1 / 12)
    plt.show()
    # Check the mode.
    # NOTE(review): `data` is not defined anywhere in the visible part of
    # this script - confirm where it is supposed to come from.
    zs, zs_count = mode(data)
    print("zs", zs, zs_count)

    # Check the median (even length, odd length, odd length with an outlier).
    data_zws1 = [1, 2, 3, 4]
    data_zws2 = [1, 2, 3, 4, 5]
    data_zws3 = [1, 2, 3, 4, 99]
    zws1 = median(data_zws1)
    zws2 = median(data_zws2)
    zws3 = median(data_zws3)
    print("zws", zws1, zws2, zws3)  # confirms again that the median is not tied to extreme values; it measures central tendency

    # Check the mean.
    data_jz = [1, 2, 3, 4, 5]
    jz = mean(data_jz)
    print("jz", jz)

    # Check the range.
    data_jc = [1, 2, 3, 999]
    jc = rng(data_jc)
    print("jc", jc)

    # Check the quartiles.
    data_sfws = [1, 4, 2, 3, 5]
    sfws = quartile(data_sfws)
    print(sfws)

    # Check the variance.
    data_fc = [1, 4, 2, 3, 5]
    fc = variance(data_fc)
Esempio n. 12
0
import random
import matplotlib.pyplot as plt
from stats.descriptive_stats import mean, variance

if __name__ == '__main__':
    # Consistency demo: plot the sample mean and the sample variance of
    # standard-normal samples against the sample size.

    # Sample sizes to try.
    indices = list(range(20, 10001, 50))
    # Sample mean and sample variance for each size.
    sample_means = []
    sample_vars = []

    for sz in indices:
        # Draw sz values from the Gaussian distribution with mean 0.0 and
        # standard deviation 1.0.
        sample = [random.gauss(0.0, 1.0) for _ in range(sz)]
        sample_means.append(mean(sample))
        sample_vars.append(variance(sample))

    plt.plot(indices, sample_means)
    plt.plot(indices, sample_vars)
    # Conclusion: as the sample size grows, the sample mean settles near the
    # population mean (0) and the sample variance near the population
    # variance (1). That is consistency.
    plt.show()
Esempio n. 13
0
import random, matplotlib.pyplot as plt
from stats.descriptive_stats import mean


def sample(num_of_samples, sample_sz):
    """
    Return the means of ``num_of_samples`` samples, each made of
    ``sample_sz`` draws from the uniform distribution on [0, 1].
    """

    def draw_sample_mean():
        # One sample of sample_sz uniform draws, reduced to its mean.
        return mean([random.uniform(0.0, 1.0) for _ in range(sample_sz)])

    return [draw_sample_mean() for _ in range(num_of_samples)]


# Central limit theorem demo: histogram of the means of 1000 samples of
# size 40 drawn from the uniform distribution on [0, 1].
if __name__ == '__main__':
    data = sample(1000, 40)
    plt.hist(data, rwidth=0.8, bins="auto")
    # Vertical line at the mean of the sample means.
    plt.axvline(c="red", x=mean(data))
    plt.show()