def test_independent_variable_singular():
    """Assert that ``fit`` raises ``ValueError`` for a duplicated predictor.

    A ``dup`` column that is an exact copy of ``Drug`` is added to a copy of
    the module-level data, and both columns enter the ``Drug * dup`` formula.
    """
    formula = "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * dup"
    singular = data.copy()
    singular["dup"] = singular["Drug"]
    # The check is performed twice, each time on a freshly built model.
    first = _MultivariateOLS.from_formula(formula, singular)
    assert_raises(ValueError, first.fit)
    second = _MultivariateOLS.from_formula(formula, singular)
    assert_raises(ValueError, second.fit)
def test_independent_variable_singular():
    """Expect ``ValueError`` from ``fit`` when ``Drug`` is duplicated.

    The copy of the data gains a ``dup`` column identical to ``Drug``; the
    model formula then uses ``Drug * dup`` on the right-hand side.
    """
    frame = data.copy()
    frame['dup'] = frame['Drug']
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * dup')
    # Two independent model instances are constructed and checked in turn.
    for _ in range(2):
        model = _MultivariateOLS.from_formula(formula, frame)
        assert_raises(ValueError, model.fit)
def test_from_formula_vs_no_formula():
    """Check that explicit endog/exog inputs reproduce the formula results.

    The reference is the default ``mv_test()`` output of a model built with
    ``from_formula``. The same design is then built with ``patsy.dmatrices``
    and passed to ``_MultivariateOLS`` both as DataFrames and as plain numpy
    arrays; the ``stat`` tables for hand-written contrast matrices must agree
    with the reference tables of the same name to 6 decimals.
    """
    formula = "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted"
    r0 = _MultivariateOLS.from_formula(formula, data).fit(method="svd").mv_test()
    endog, exog = patsy.dmatrices(formula, data, return_type="dataframe")

    # (hypothesis name, contrast matrix L) pairs; each is compared with the
    # reference table stored under the same name in ``r0``.
    contrasts = [
        ("Intercept", np.array([[1, 0, 0, 0, 0, 0]])),
        ("Drug", np.array([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]])),
    ]
    # DataFrame input first, then numpy array input.
    inputs = [(endog, exog), (endog.values, exog.values)]

    for name, L in contrasts:
        for y, x in inputs:
            # Always refit before testing so that no stale results object is
            # reused (the original issued one redundant mv_test call here).
            r = _MultivariateOLS(y, x).fit(method="svd")
            r1 = r.mv_test(hypotheses=[[name, L, None]])
            assert_array_almost_equal(r1[name]["stat"].values, r0[name]["stat"].values, decimal=6)
def test_from_formula_vs_no_formula():
    """Compare formula-built results with explicit endog/exog construction.

    ``r0`` holds the default ``mv_test()`` output of the formula model. The
    design matrices produced by ``patsy.dmatrices`` for the same formula are
    fed to ``_MultivariateOLS`` as DataFrames and as numpy arrays, and the
    ``stat`` table of each hand-specified hypothesis must match the table of
    the same name in ``r0`` to 6 decimals.
    """
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    r0 = _MultivariateOLS.from_formula(formula, data).fit(
        method='svd').mv_test()
    endog, exog = patsy.dmatrices(formula, data, return_type="dataframe")

    def check(name, L, y, x):
        # Refit from the given containers and compare one hypothesis table
        # against the formula-based reference.
        r = _MultivariateOLS(y, x).fit(method='svd')
        r1 = r.mv_test(hypotheses=[[name, L, None]])
        assert_array_almost_equal(r1[name]['stat'].values,
                                  r0[name]['stat'].values, decimal=6)

    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    check('Intercept', L, endog, exog)
    # Numpy array input
    check('Intercept', L, endog.values, exog.values)

    # The original issued an extra mv_test call on the previous results
    # object here whose value was discarded; it has been dropped.
    L = np.array([[0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  ])
    # DataFrame input
    check('Drug', L, endog, exog)
    # Numpy array input
    check('Drug', L, endog.values, exog.values)
def test_L_M_matrices_1D_array():
    """Assert that ``mv_test`` raises ``ValueError`` for 1-D ``L`` or ``M``.

    Two cases are checked: a one-dimensional contrast ``L`` with no ``M``, and
    a two-dimensional ``L`` combined with a one-dimensional ``M``.
    """
    formula = "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted"
    fitted = _MultivariateOLS.from_formula(formula, data).fit(method="svd")

    # Case 1: L is a flat vector.
    flat_L = np.array([1, 0, 0, 0, 0, 0])
    assert_raises(ValueError, fitted.mv_test, hypotheses=[["Drug", flat_L, None]])

    # Case 2: L is a proper row matrix, M is a flat vector.
    row_L = np.array([[1, 0, 0, 0, 0, 0]])
    flat_M = np.array([1, 0, 0, 0, 0, 0])
    assert_raises(ValueError, fitted.mv_test, hypotheses=[["Drug", row_L, flat_M]])
def test_L_M_matrices_1D_array():
    """Check that one-dimensional ``L`` / ``M`` arrays are rejected.

    ``mv_test`` is expected to raise ``ValueError`` both when ``L`` is 1-D and
    when ``L`` is 2-D but ``M`` is 1-D.
    """
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    results = _MultivariateOLS.from_formula(formula, data).fit(method='svd')
    cases = [
        # (L, M)
        (np.array([1, 0, 0, 0, 0, 0]), None),
        (np.array([[1, 0, 0, 0, 0, 0]]), np.array([1, 0, 0, 0, 0, 0])),
    ]
    for contrast, transform in cases:
        assert_raises(ValueError, results.mv_test,
                      hypotheses=[['Drug', contrast, transform]])
def test_exog_1D_array():
    """Regression check of ``mv_test`` for the ``0 + Depleted`` model.

    The ``stat`` table of the ``Depleted`` term is compared with stored
    reference values to 4 decimals.
    """
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ 0 + Depleted')
    tests = _MultivariateOLS.from_formula(formula, data).fit(
        method='svd').mv_test()
    expected = [
        [0.0019, 8.0000, 20.0000, 55.0013, 0.0000],
        [1.8112, 8.0000, 22.0000, 26.3796, 0.0000],
        [97.8858, 8.0000, 12.1818, 117.1133, 0.0000],
        [93.2742, 4.0000, 11.0000, 256.5041, 0.0000],
    ]
    observed = tests['Depleted']['stat'].values
    assert_array_almost_equal(observed, expected, decimal=4)
def test_exog_1D_array():
    """Compare the ``Depleted`` test table of a no-intercept model to stored values.

    The model uses ``0 + Depleted`` on the right-hand side and is fitted with
    ``method="svd"``; agreement is required to 4 decimals.
    """
    reference = [
        [0.0019, 8.0000, 20.0000, 55.0013, 0.0000],
        [1.8112, 8.0000, 22.0000, 26.3796, 0.0000],
        [97.8858, 8.0000, 12.1818, 117.1133, 0.0000],
        [93.2742, 4.0000, 11.0000, 256.5041, 0.0000],
    ]
    model = _MultivariateOLS.from_formula("Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ 0 + Depleted", data)
    fitted = model.fit(method="svd")
    depleted_stat = fitted.mv_test()["Depleted"]["stat"]
    assert_array_almost_equal(depleted_stat.values, reference, decimal=4)
def test_specify_L_M_by_string():
    """Check ``mv_test`` when ``L`` and ``M`` are given as lists of names.

    First the ``Intercept`` hypothesis specified by name is compared with
    stored statistics; then the ``contrast_L`` and ``transform_M`` matrices
    reported for a name-based hypothesis are compared with the expected 0/1
    matrices.
    """
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    fitted = _MultivariateOLS.from_formula(formula, data).fit()

    by_name = fitted.mv_test(hypotheses=[['Intercept', ['Intercept'], None]])
    # All four statistics share the same df and F / p-value columns.
    shared = [4, 6, 5.43435304e+01, 7.59585610e-05]
    expected_stat = [[2.68607660e-02] + shared,
                     [9.73139234e-01] + shared,
                     [3.62290202e+01] + shared,
                     [3.62290202e+01] + shared]
    assert_array_almost_equal(by_name['Intercept']['stat'].values,
                              expected_stat, decimal=6)

    exog_names = ['Intercept', 'Drug[T.Trimethaphan]', 'Drug[T.placebo]']
    endog_names = ['Histamine1', 'Histamine3', 'Histamine5']
    by_name = fitted.mv_test(hypotheses=[['a', exog_names, endog_names]])
    expected_L = [[1, 0, 0, 0, 0, 0],
                  [0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0]]
    assert_array_almost_equal(by_name['a']['contrast_L'], expected_L,
                              decimal=10)
    expected_M_transposed = [[0, 1, 0, 0],
                             [0, 0, 1, 0],
                             [0, 0, 0, 1]]
    assert_array_almost_equal(by_name['a']['transform_M'].T,
                              expected_M_transposed, decimal=10)
def test_specify_L_M_by_string():
    """Verify name-based specification of ``L`` and ``M`` in ``mv_test``.

    The statistics of an ``Intercept`` hypothesis given by name are checked
    against stored values, and the matrices ``contrast_L`` / ``transform_M``
    returned for a second name-based hypothesis are checked against shifted
    identity matrices.
    """
    res = _MultivariateOLS.from_formula(
        "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted", data
    ).fit()

    intercept_test = res.mv_test(hypotheses=[["Intercept", ["Intercept"], None]])
    leading = (2.68607660e-02, 9.73139234e-01, 3.62290202e01, 3.62290202e01)
    expected = [[value, 4, 6, 5.43435304e01, 7.59585610e-05] for value in leading]
    assert_array_almost_equal(intercept_test["Intercept"]["stat"].values, expected, decimal=6)

    rows = ["Intercept", "Drug[T.Trimethaphan]", "Drug[T.placebo]"]
    cols = ["Histamine1", "Histamine3", "Histamine5"]
    named_test = res.mv_test(hypotheses=[["a", rows, cols]])
    # Ones on the main diagonal of a 3 x 6 matrix.
    assert_array_almost_equal(named_test["a"]["contrast_L"], np.eye(3, 6), decimal=10)
    # Ones on the first super-diagonal of a 3 x 4 matrix.
    assert_array_almost_equal(named_test["a"]["transform_M"].T, np.eye(3, 4, k=1), decimal=10)
def compare_r_output_dogs_data(method):
    """Check the default ``mv_test`` tables against stored reference values.

    Tests a within-subject effect interacting with two between-subject
    effects; the reference values come from the R ``car`` library,
    ``Anova(, type=3)``.

    Note: the test statistics (Pillai, Wilks, Hotelling-Lawley and Roy) are
    the same as the R output, but the approximate F and the degrees of
    freedom can differ because this implementation is based on the SAS
    formula [1].

    Parameters
    ----------
    method
        Passed through unchanged as ``method`` to ``fit``.

    .. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    """
    # Repeated measures with orthogonal polynomial contrasts coding
    formula = "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted"
    tests = _MultivariateOLS.from_formula(formula, data).fit(method=method).mv_test()

    # Expected ``stat`` table per model term, checked in insertion order.
    expected = {
        "Intercept": [
            [2.68607660e-02, 4, 6, 5.43435304e01, 7.59585610e-05],
            [9.73139234e-01, 4, 6, 5.43435304e01, 7.59585610e-05],
            [3.62290202e01, 4, 6, 5.43435304e01, 7.59585610e-05],
            [3.62290202e01, 4, 6, 5.43435304e01, 7.59585610e-05],
        ],
        "Drug": [
            [8.39646619e-02, 8, 1.20000000e01, 3.67658068e00, 2.12614444e-02],
            [1.18605382e00, 8, 1.40000000e01, 2.55003861e00, 6.01270701e-02],
            [7.69391362e00, 8, 6.63157895e00, 5.50814270e00, 2.07392260e-02],
            [7.25036952e00, 4, 7.00000000e00, 1.26881467e01, 2.52669877e-03],
        ],
        "Depleted": [
            [0.32048892, 4.0, 6.0, 3.18034906, 0.10002373],
            [0.67951108, 4.0, 6.0, 3.18034906, 0.10002373],
            [2.12023271, 4.0, 6.0, 3.18034906, 0.10002373],
            [2.12023271, 4.0, 6.0, 3.18034906, 0.10002373],
        ],
        "Drug:Depleted": [
            [0.15234366, 8.0, 12.0, 2.34307678, 0.08894239],
            [1.13013353, 8.0, 14.0, 2.27360606, 0.08553213],
            [3.70989596, 8.0, 6.63157895, 2.65594824, 0.11370285],
            [3.1145597, 4.0, 7.0, 5.45047947, 0.02582767],
        ],
    }
    for term, table in expected.items():
        assert_array_almost_equal(tests[term]["stat"].values, table, decimal=6)
def test_affine_hypothesis():
    """Test an affine hypothesis (``L``, ``M``, ``C``) passed to ``mv_test``.

    Reference values were obtained with R ``car::linearHypothesis``. The test
    statistics (Pillai, Wilks, Hotelling-Lawley and Roy) match the R output,
    but the approximate F and degrees of freedom can differ because this
    implementation follows the SAS formula.
    """
    formula = "Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted"
    fitted = _MultivariateOLS.from_formula(formula, data).fit(method="svd")

    contrast = np.array([[0, 1.2, 1.1, 1.3, 1.5, 1.4], [0, 3.2, 2.1, 3.3, 5.5, 4.4]])
    constant = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    # The transform matrix M is left as None.
    outcome = fitted.mv_test(hypotheses=[("test1", contrast, None, constant)])

    expected = [
        [0.0269, 8.0000, 12.0000, 7.6441, 0.0010],
        [1.4277, 8.0000, 14.0000, 4.3657, 0.0080],
        [19.2678, 8.0000, 6.6316, 13.7940, 0.0016],
        [18.3470, 4.0000, 7.0000, 32.1072, 0.0001],
    ]
    assert_array_almost_equal(outcome["test1"]["stat"].values, expected, decimal=4)
    # Smoke check: building the summary with all optional matrices shown.
    outcome.summary(show_contrast_L=True, show_transform_M=True, show_constant_C=True)
def test_affine_hypothesis():
    # Affine hypothesis test; reference values from R car linearHypothesis.
    # Note: the test statistics (Pillai, Wilks, Hotelling-Lawley and Roy)
    # agree with the R output, while the approximate F and the degrees of
    # freedom may differ because this implementation follows the SAS formula.
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    res = _MultivariateOLS.from_formula(formula, data).fit(method='svd')

    # Hypothesis tuple is (name, L, M, C); M is not specified.
    hypothesis = (
        'test1',
        np.array([[0, 1.2, 1.1, 1.3, 1.5, 1.4],
                  [0, 3.2, 2.1, 3.3, 5.5, 4.4]]),
        None,
        np.array([[1, 2, 3, 4],
                  [5, 6, 7, 8]]),
    )
    tested = res.mv_test(hypotheses=[hypothesis])

    reference = [
        [0.0269, 8.0000, 12.0000, 7.6441, 0.0010],
        [1.4277, 8.0000, 14.0000, 4.3657, 0.0080],
        [19.2678, 8.0000, 6.6316, 13.7940, 0.0016],
        [18.3470, 4.0000, 7.0000, 32.1072, 0.0001],
    ]
    assert_array_almost_equal(tested['test1']['stat'].values, reference,
                              decimal=4)

    # Exercise the summary with every optional matrix displayed.
    display_options = dict(show_contrast_L=True,
                           show_transform_M=True,
                           show_constant_C=True)
    tested.summary(**display_options)
def test_specify_L_M_by_string():
    """Exercise ``mv_test`` with ``L`` and ``M`` supplied as name lists.

    Checks the statistics of a name-based ``Intercept`` hypothesis and the
    ``contrast_L`` / ``transform_M`` matrices reported for a hypothesis built
    from three exog names and three endog names.
    """
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    model = _MultivariateOLS.from_formula(formula, data)
    results = model.fit()

    # --- hypothesis referring to a single exog column by name ---
    intercept_hyp = ['Intercept', ['Intercept'], None]
    tested = results.mv_test(hypotheses=[intercept_hyp])
    want_stat = [
        [2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
        [9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
        [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
        [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
    ]
    got_stat = tested['Intercept']['stat'].values
    assert_array_almost_equal(got_stat, want_stat, decimal=6)

    # --- hypothesis with both L and M given as name lists ---
    named_hyp = ['a',
                 ['Intercept', 'Drug[T.Trimethaphan]', 'Drug[T.placebo]'],
                 ['Histamine1', 'Histamine3', 'Histamine5']]
    tested = results.mv_test(hypotheses=[named_hyp])
    want_L = [
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
    ]
    assert_array_almost_equal(tested['a']['contrast_L'], want_L, decimal=10)
    want_M_T = [
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ]
    assert_array_almost_equal(tested['a']['transform_M'].T, want_M_T,
                              decimal=10)
def compare_r_output_dogs_data(method):
    '''Compare ``mv_test`` tables on the dogs data with stored R results.

    Covers a within-subject effect interacting with two between-subject
    effects. Reference output: R car library, Anova(, type=3).

    Note: The test statistics Pillai, Wilks, Hotelling-Lawley and Roy are
          the same as the R output, but the approximate F and degrees of
          freedom can be different, because this implementation is based on
          the SAS formula [1].

    ``method`` is forwarded unchanged to ``fit``.

    .. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    '''
    # Repeated measures with orthogonal polynomial contrasts coding
    formula = ('Histamine0 + Histamine1 + Histamine3 + Histamine5'
               ' ~ Drug * Depleted')
    model = _MultivariateOLS.from_formula(formula, data)
    tables = model.fit(method=method).mv_test()

    # (term, expected stat table) pairs, asserted in this order.
    checks = [
        ('Intercept',
         [[2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
          [9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
          [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
          [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05]]),
        ('Drug',
         [[8.39646619e-02, 8, 1.20000000e+01, 3.67658068e+00, 2.12614444e-02],
          [1.18605382e+00, 8, 1.40000000e+01, 2.55003861e+00, 6.01270701e-02],
          [7.69391362e+00, 8, 6.63157895e+00, 5.50814270e+00, 2.07392260e-02],
          [7.25036952e+00, 4, 7.00000000e+00, 1.26881467e+01, 2.52669877e-03]]),
        ('Depleted',
         [[0.32048892, 4., 6., 3.18034906, 0.10002373],
          [0.67951108, 4., 6., 3.18034906, 0.10002373],
          [2.12023271, 4., 6., 3.18034906, 0.10002373],
          [2.12023271, 4., 6., 3.18034906, 0.10002373]]),
        ('Drug:Depleted',
         [[0.15234366, 8., 12.,        2.34307678, 0.08894239],
          [1.13013353, 8., 14.,        2.27360606, 0.08553213],
          [3.70989596, 8., 6.63157895, 2.65594824, 0.11370285],
          [3.1145597,  4., 7.,         5.45047947, 0.02582767]]),
    ]
    for term, wanted in checks:
        assert_array_almost_equal(tables[term]['stat'].values, wanted,
                                  decimal=6)
from scipy import stats
from statsmodels.stats.multivariate import test_cov_oneway
from statsmodels.multivariate.multivariate_ols import _MultivariateOLS
import statsmodels.stats.oneway as smo
data = pd.read_csv(r"D:\书籍资料整理\多元统计分析\例2-1.csv")

# 1. Normality test (Shapiro-Wilk) on each column.
# ``DataFrame.items`` replaces ``iteritems``, which was removed in pandas 2.0.
for key, value in data.items():
    # Skip '省份' (province) and '城市' (city) -- presumably label columns.
    if key in ['省份', '城市']:
        continue
    # Run the test once and report it; the original built this string but
    # discarded it, and called an un-imported ``shapiro`` twice.
    statistic, pvalue = stats.shapiro(value)
    print(key + ':statistic %s,pvalue:%s' % (statistic, pvalue))
    # Only per-capita regional GDP (人均地区生产总值) and public fiscal
    # expenditure (公共财政支出) satisfy normality.
# The K-S test is not suitable for small samples; see the notes on
# statistics basics / hypothesis testing.

# 2. Multivariate test table showing that province has an effect on Y.
mod = _MultivariateOLS.from_formula('人均地区生产总值+公共财政支出 ~ 省份', data)
result = mod.fit(method='svd')
print(result.mv_test())
# 3. Multivariate statistics -- test of equality of covariance matrices.

temp_data = []
temp_name = []
group_sizes = []
# Group by a scalar key so that ``name`` is the province label itself rather
# than a 1-tuple (pandas 2.x yields tuples for list-valued keys).
for name, group in data[['省份', '人均地区生产总值', '公共财政支出']].groupby('省份'):
    temp_data.append(np.cov(np.asarray(group[['人均地区生产总值', '公共财政支出']].T)))
    temp_name.append(name)
    # Number of observations in this group (was hard-coded as 5 per group).
    group_sizes.append(len(group))

# statistic_base is Box's M statistic.
# pvalue is the p-value given in the book.
cov_test = test_cov_oneway(temp_data, group_sizes)
print(cov_test)

# 4. Error variance analysis