Ejemplo n.º 1
0
def test_chi2_contingency_g():
    """Check the statistic returned for ``lambda_='log-likelihood'``.

    In every case the statistic reported by chi2_contingency is compared
    with ``2 * sum(table * log(table / expected))`` evaluated directly
    from the expected frequencies that the same call returned.
    """
    def g_statistic(table, expected):
        # Direct evaluation of the log-likelihood ratio statistic.
        return 2*xlogy(table, table/expected).sum()

    observed = np.array([[15, 60], [15, 90]])

    # 2x2 table with the continuity correction switched off.
    stat, _, _, expected = chi2_contingency(observed,
                                            lambda_='log-likelihood',
                                            correction=False)
    assert_allclose(stat, g_statistic(observed, expected))

    # Same table with the correction on: the reference value is computed
    # from the observed counts shifted by 0.5 as given below.
    stat, _, _, expected = chi2_contingency(observed,
                                            lambda_='log-likelihood',
                                            correction=True)
    shifted = observed + np.array([[-0.5, 0.5], [0.5, -0.5]])
    assert_allclose(stat, g_statistic(shifted, expected))

    # 2x3 table with the default `correction`; the reference value uses
    # the unshifted counts.
    observed = np.array([[10, 12, 10], [12, 10, 10]])
    stat, _, _, expected = chi2_contingency(observed,
                                            lambda_='log-likelihood')
    assert_allclose(stat, g_statistic(observed, expected))
Ejemplo n.º 2
0
def test_chi2_contingency_g():
    """Verify chi2_contingency's log-likelihood (G) statistic.

    Each result is checked against ``2 * xlogy(t, t / expected).sum()``,
    where ``t`` is the table the statistic is expected to be based on.
    """
    counts = np.array([[15, 60], [15, 90]])

    # Correction disabled: the statistic is based on the raw counts.
    result = chi2_contingency(counts, lambda_='log-likelihood',
                              correction=False)
    reference = 2*xlogy(counts, counts/result[3]).sum()
    assert_allclose(result[0], reference)

    # Correction enabled: the statistic is based on counts moved by 0.5.
    result = chi2_contingency(counts, lambda_='log-likelihood',
                              correction=True)
    moved = counts + np.array([[-0.5, 0.5], [0.5, -0.5]])
    reference = 2*xlogy(moved, moved/result[3]).sum()
    assert_allclose(result[0], reference)

    # 2x3 table, default `correction`: compared against the raw counts.
    counts = np.array([[10, 12, 10], [12, 10, 10]])
    result = chi2_contingency(counts, lambda_='log-likelihood')
    reference = 2*xlogy(counts, counts/result[3]).sum()
    assert_allclose(result[0], reference)
Ejemplo n.º 3
0
def test_chi2_contingency_trivial():
    """Exercise chi2_contingency on tables whose expected values equal the data.

    For both inputs the test requires a statistic of exactly 0, a p-value
    of exactly 1, and expected frequencies identical to the observations.
    """
    # (observed table, degrees of freedom the test expects)
    cases = [
        # A trivial case: two identical rows.
        (np.array([[1, 2], [1, 2]]), 1),
        # A *really* trivial case: 1-D data.
        (np.array([1, 2, 3]), 0),
    ]
    for table, expected_dof in cases:
        stat, pvalue, ndof, expected_freq = chi2_contingency(
            table, correction=False)
        assert_equal(stat, 0.0)
        assert_equal(pvalue, 1.0)
        assert_equal(ndof, expected_dof)
        assert_array_equal(table, expected_freq)
def test_chi2_contingency_trivial():
    """Run chi2_contingency on two degenerate inputs.

    Both a 2x2 table with identical rows and a 1-D array must give a
    statistic of 0, a p-value of 1 and expected frequencies equal to the
    input; only the expected degrees of freedom differ (1 and 0).
    """
    def check(table, expected_dof):
        # All four outputs are compared with exact equality.
        result = chi2_contingency(table, correction=False)
        assert_equal(result[0], 0.0)
        assert_equal(result[1], 1.0)
        assert_equal(result[2], expected_dof)
        assert_array_equal(table, result[3])

    # A trivial case
    check(np.array([[1, 2], [1, 2]]), 1)

    # A *really* trivial case: 1-D data.
    check(np.array([1, 2, 3]), 0)
Ejemplo n.º 5
0
def independentChi(A, B, alpha=0.05):
    """Report whether A and B look independent under Pearson's chi-squared test.

    The contingency table of A and B is built with ``getContingency`` and
    passed to ``chi2_contingency`` without continuity correction.  The full
    test result is printed.

    Parameters
    ----------
    A, B
        Inputs forwarded unchanged to ``getContingency``.
    alpha : float, optional
        Significance level (default 0.05).

    Returns
    -------
    bool
        True when independence is NOT rejected (p-value >= alpha), False
        when the test rejects independence (p-value < alpha).
    """
    contingency = getContingency(A, B)

    res = chi2_contingency(contingency, correction=False)
    print(res)

    # res[1] is the p-value.  Independence is the null hypothesis, so a
    # p-value below alpha is evidence AGAINST independence; the previous
    # `res[1] < alpha` therefore returned True for dependent variables,
    # contradicting the function's name.
    return res[1] >= alpha
Ejemplo n.º 6
0
def independentG(A, B, alpha=0.05):
    """Report whether A and B look independent under the log-likelihood test.

    The contingency table of A and B is built with ``getContingency`` and
    passed to ``chi2_contingency`` with ``lambda_="log-likelihood"`` and no
    continuity correction.  The full test result is printed.

    Parameters
    ----------
    A, B
        Inputs forwarded unchanged to ``getContingency``.
    alpha : float, optional
        Significance level (default 0.05).

    Returns
    -------
    bool
        True when independence is NOT rejected (p-value >= alpha), False
        when the test rejects independence (p-value < alpha).
    """
    contingency = getContingency(A, B)

    res = chi2_contingency(contingency,
                           correction=False,
                           lambda_="log-likelihood")
    print(res)

    # res[1] is the p-value.  Independence is the null hypothesis, so a
    # p-value below alpha is evidence AGAINST independence; the previous
    # `res[1] < alpha` therefore returned True for dependent variables,
    # contradicting the function's name.
    return res[1] >= alpha
Ejemplo n.º 7
0
def Analysis(rdata):
    """Test whether preferred coffee brand depends on gender.

    Parameters
    ----------
    rdata
        Anything accepted by ``pd.DataFrame``; the resulting frame must have
        a ``'gender'`` column and a ``'co_survey'`` column.

    Returns
    -------
    tuple
        ``(crosstab, result)``: the gender-code by ``co_survey``
        cross-tabulation, and an HTML message (in Korean) that reports the
        p-value and whether the null hypothesis is kept or rejected at the
        0.05 level.
    """
    df = pd.DataFrame(rdata)  # Work on a DataFrame built from the input.

    # Numeric gender code: 1 when gender is '남', 2 for anything else.
    df['genNum'] = df['gender'].apply(lambda g: 1 if g == '남' else 2)

    # Observed frequencies: gender code as rows, surveyed brand as columns.
    crosstal = pd.crosstab(index=df['genNum'], columns=df['co_survey'])

    # chi2_contingency expects a table of observed frequencies.  The
    # previous call passed the two raw code columns, which were then
    # treated as a 2 x N table of counts and gave a meaningless p-value.
    pv = chi2_contingency(crosstal)[1]

    if pv > 0.05:
        # Message: p exceeds 0.05, so the null hypothesis is kept
        # (brand preference does not differ by gender).
        result = "<b>p 값이  {}</b>이므로 유의수준 0.05보다 커 <b>귀무가설을 채택</b><br>(성별에 따라 선호하는 커피 브랜드에는 차이가 없다)".format(pv)
    else:
        # Message: p is at most 0.05, so the null hypothesis is rejected
        # (brand preference differs by gender).
        result = "<b>p 값이  {}</b>이므로 유의수준 0.05보다 작아 <b>귀무가설을 기각</b><br>(성별에 따라 선호하는 커피 브랜드에는 차이가 있다)".format(pv)

    return crosstal, result
Ejemplo n.º 8
0
def test_chi2_contingency_R():
    """Compare chi2_contingency with results computed independently in R.

    Two multi-way tables are tested.  The statistic, p-value and degrees
    of freedom are checked against the output of R's
    ``summary(xtabs(...))``; the R sessions are quoted in the comments.
    """
    # ---- Case 1: three factors (r = rows, c = columns, t = tiers) ----
    #
    # R code:
    #     # Data vector.
    #     data <- c(
    #       12, 34, 23,     4,  47,  11,
    #       35, 31, 11,    34,  10,  18,
    #       12, 32,  9,    18,  13,  19,
    #       12, 12, 14,     9,  33,  25
    #       )
    #
    #     # Create factor tags:r=rows, c=columns, t=tiers
    #     r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #     c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #     t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #     # 3-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+t))
    #     print(s)
    #
    # R output:
    #     Call: xtabs(formula = data ~ r + c + t)
    #     Number of cases in table: 478
    #     Number of factors: 3
    #     Test for independence of all factors:
    #             Chisq = 102.17, df = 17, p-value = 3.514e-14
    first_tier = [[12, 34, 23], [35, 31, 11], [12, 32, 9], [12, 12, 14]]
    second_tier = [[4, 47, 11], [34, 10, 18], [18, 13, 19], [9, 33, 25]]
    table = np.array([first_tier, second_tier])
    stat, pvalue, ndof, _ = chi2_contingency(table)
    assert_approx_equal(stat, 102.17, significant=5)
    assert_approx_equal(pvalue, 3.514e-14, significant=4)
    assert_equal(ndof, 17)

    # ---- Case 2: four factors (r, c, d = depths(?), t = tiers) ----
    #
    # R code:
    #     # Data vector.
    #     data <- c(
    #         #
    #         12, 17,
    #         11, 16,
    #         #
    #         11, 12,
    #         15, 16,
    #         #
    #         23, 15,
    #         30, 22,
    #         #
    #         14, 17,
    #         15, 16
    #         )
    #
    #     # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #     r <- factor(gl(2, 2,  2*2*2*2, labels=c("r1", "r2")))
    #     c <- factor(gl(2, 1,  2*2*2*2, labels=c("c1", "c2")))
    #     d <- factor(gl(2, 4,  2*2*2*2, labels=c("d1", "d2")))
    #     t <- factor(gl(2, 8,  2*2*2*2, labels=c("t1", "t2")))
    #
    #     # 4-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+d+t))
    #     print(s)
    #
    # R output:
    #     Call: xtabs(formula = data ~ r + c + d + t)
    #     Number of cases in table: 262
    #     Number of factors: 4
    #     Test for independence of all factors:
    #             Chisq = 8.758, df = 11, p-value = 0.6442
    flat_counts = [12, 17, 11, 16,
                   11, 12, 15, 16,
                   23, 15, 30, 22,
                   14, 17, 15, 16]
    # Row-major reshape reproduces the nested 2x2x2x2 layout.
    table = np.array(flat_counts).reshape(2, 2, 2, 2)
    stat, pvalue, ndof, _ = chi2_contingency(table)
    assert_approx_equal(stat, 8.758, significant=4)
    assert_approx_equal(pvalue, 0.6442, significant=4)
    assert_equal(ndof, 11)
Ejemplo n.º 9
0
import pandas as pd
import numpy as np
import scipy
from scipy.stats.contingency import chi2_contingency

# Load the input records.
df1 = pd.read_csv("C:/Users/Aaron Korver/Desktop/AnalyseCijfers2.csv")

# Collapse 'trip_purpose' into two string labels: the first group of
# purposes becomes '1', the second group becomes '0'.
df1['trip_purpose'] = df1['trip_purpose'].replace(['betaaldwerk', 'dagelijkeboodschappen', 'diensten','niet-dagelijkeboodschappen','studie'], '1')
df1['trip_purpose'] = df1['trip_purpose'].replace(['recreatie', 'sociaal', 'vrijetijd','home'], '0')

# Observed frequencies of the recoded purpose per 'BG2010NameDest' value.
contingency = pd.crosstab(df1['BG2010NameDest'], df1['trip_purpose'])

# Keep a copy of the table on disk, then run the chi-squared test on it.
contingency.to_csv("C:/Users/Aaron Korver/Desktop/ChiSquareOrig3.csv")
chi2, p, dof, expected = chi2_contingency(contingency)

# The Python 2 `print chi2, p, dof` statement is a SyntaxError on Python 3.
# Formatting first prints the same space-separated values on both versions.
print("{} {} {}".format(chi2, p, dof))
Ejemplo n.º 10
0
def test_independence(data, colX, colY):
    X = data[colX].astype(str)
    Y = data[colY].astype(str)
    observed = pd.crosstab(Y, X)
    chi2, p, dof, expected = cn.chi2_contingency(observed.values)
    return p
Ejemplo n.º 11
0
"""
Created on Wed May  8 13:02:49 2019

@author: mfatemeh
"""

import pandas as pd

# Load the Titanic training data.
df = pd.read_csv('titanic_train.csv')
# Per-column count of missing values (the result is not stored).
df.isnull().sum()

# Observed frequencies: survival against passenger class.
observed_contigency_table = pd.crosstab(df.Survived, df.Pclass)

from scipy.stats.contingency import chi2_contingency

(chi_2,
 p_val,
 dof,
 expected_contigency_table) = chi2_contingency(observed_contigency_table)
# Author's note: the degrees of freedom are 2, i.e. (3 - 1) * 1.
# Author's note: the p-value is very low, so H_0 (independence) is
# rejected and the two variables are treated as dependent (correlated).

# Observed frequencies: survival against sex.
observed_contigency_table2 = pd.crosstab(df.Survived, df.Sex)

(chi_2_G,
 p_val,
 dof_G,
 expected_contigency_table_G) = chi2_contingency(observed_contigency_table2)

# Report the outcome of the second test: a p-value below alpha is
# reported as "correlated", anything else as "independent".
alpha = 0.05
print('The varianles are correlated at significant level'
      if p_val < alpha
      else 'The variables are independent at signifiacant level',
      alpha)
Ejemplo n.º 12
0
###--- dropping unnecessary feature
df = df.drop(['# Columns: time'], axis=1)

###--- separation of dependent and independent variables
X = df.iloc[:, 2:7].values
y = df.iloc[:, -1].values

###--- correlation analysis

corr = df.corr()
sns.heatmap(abs(corr), annot=True)

from scipy.stats.contingency import chi2_contingency

# Significance level shared by every test below.
alpha = 0.05

# Chi-squared test of the target against each feature column, once each.
# The previous loop ran over range(len(X[0]) + 1) and indexed X[:, i - 1],
# which started at column -1 and so tested the last column twice.
for i in range(X.shape[1]):
    contigency_table = pd.crosstab(y, X[:, i])
    chi_2, p_val, dof, expected_val = chi2_contingency(contigency_table)
    # A p-value below alpha is reported as "correlated".
    if p_val < alpha:
        print('The varianles are correlated at significant level', alpha)
    else:
        print('The variables are independent at signifiacant level', alpha)

###-----encoding and labling
df.detected_activity = df.detected_activity.map({
    'bending_1': 1,
    'bending_2': 2,
    'cycling': 3,
    'lying': 4,
    'sitting': 5,
    'standing': 6,
Ejemplo n.º 13
0
def test_chi2_contingency_R():
    """Check chi2_contingency against reference results produced with R.

    Covers a 3-way table, a 4-way table (both compared with R's
    ``summary(xtabs(...))``) and a 2x2 table compared with R's
    ``chisq.test`` using Yates' continuity correction.  The R sessions
    are quoted in the comments next to the data they belong to.
    """
    # 3-way table.  R session:
    #
    #   # Data vector.
    #   data <- c(
    #     12, 34, 23,     4,  47,  11,
    #     35, 31, 11,    34,  10,  18,
    #     12, 32,  9,    18,  13,  19,
    #     12, 12, 14,     9,  33,  25
    #     )
    #
    #   # Create factor tags:r=rows, c=columns, t=tiers
    #   r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #   c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #   t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #   # 3-way Chi squared test of independence
    #   s = summary(xtabs(data~r+c+t))
    #   print(s)
    #
    #   Call: xtabs(formula = data ~ r + c + t)
    #   Number of cases in table: 478
    #   Number of factors: 3
    #   Test for independence of all factors:
    #           Chisq = 102.17, df = 17, p-value = 3.514e-14
    three_way = np.array([
        [[12, 34, 23], [35, 31, 11], [12, 32, 9], [12, 12, 14]],
        [[4, 47, 11], [34, 10, 18], [18, 13, 19], [9, 33, 25]],
    ])

    # 4-way table.  R session:
    #
    #   # Data vector.
    #   data <- c(
    #       #
    #       12, 17,
    #       11, 16,
    #       #
    #       11, 12,
    #       15, 16,
    #       #
    #       23, 15,
    #       30, 22,
    #       #
    #       14, 17,
    #       15, 16
    #       )
    #
    #   # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #   r <- factor(gl(2, 2,  2*2*2*2, labels=c("r1", "r2")))
    #   c <- factor(gl(2, 1,  2*2*2*2, labels=c("c1", "c2")))
    #   d <- factor(gl(2, 4,  2*2*2*2, labels=c("d1", "d2")))
    #   t <- factor(gl(2, 8,  2*2*2*2, labels=c("t1", "t2")))
    #
    #   # 4-way Chi squared test of independence
    #   s = summary(xtabs(data~r+c+d+t))
    #   print(s)
    #
    #   Call: xtabs(formula = data ~ r + c + d + t)
    #   Number of cases in table: 262
    #   Number of factors: 4
    #   Test for independence of all factors:
    #           Chisq = 8.758, df = 11, p-value = 0.6442
    four_way = np.array([
        [[[12, 17], [11, 16]], [[11, 12], [15, 16]]],
        [[[23, 15], [30, 22]], [[14, 17], [15, 16]]],
    ])

    # (table, R statistic, digits checked for the statistic, R p-value,
    #  R degrees of freedom)
    reference_cases = [
        (three_way, 102.17, 5, 3.514e-14, 17),
        (four_way, 8.758, 4, 0.6442, 11),
    ]
    for table, ref_stat, stat_digits, ref_p, ref_dof in reference_cases:
        stat, pvalue, ndof, _ = chi2_contingency(table)
        assert_approx_equal(stat, ref_stat, significant=stat_digits)
        assert_approx_equal(pvalue, ref_p, significant=4)
        assert_equal(ndof, ref_dof)

    # Yates' correction (unbalanced case).  R session:
    #
    #   > X <- matrix(c(1000, 10, 1, 0), ncol=2, nrow=2)
    #   > X
    #       [,1] [,2]
    #   [1,] 1000    1
    #   [2,]   10    0
    #   > chisq.test(X)
    #           Pearson's Chi-squared test with Yates' continuity correction
    #   data:  X
    #   X-squared = 5.163e-28, df = 1, p-value = 1
    unbalanced = np.array([[1000, 1], [10, 0]])
    stat, pvalue, _, _ = chi2_contingency(unbalanced, correction=True)
    assert_approx_equal(pvalue, 1.0, significant=2)
    assert_allclose(stat, 0.0)
Ejemplo n.º 14
0
def test_result(correction):
    obs = np.array([[1, 2], [1, 2]])
    res = chi2_contingency(obs, correction=correction)
    assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
Ejemplo n.º 15
0
"""
Created on Wed May  8 13:04:22 2019

@author: gsaikia
"""

import pandas as pd

# Load the Titanic training data.
df = pd.read_csv('titanic_train.csv')

# Per-column count of missing values (the result is not stored).
df.isnull().sum()

# Observed frequencies: survival against sex.
observed_contigency_table = pd.crosstab(df.Survived, df.Sex)

from scipy.stats.contingency import chi2_contingency

(chi_2,
 p_val,
 dof,
 expected_contingency_table) = chi2_contingency(observed_contigency_table)

# Significance level for the decision printed below.
alpha = 0.05

# A p-value below alpha is reported as "correlated"; otherwise the script
# reports that independence could not be rejected.
print(
    'The variables are correlated at signicance level'
    if p_val < alpha
    else 'We fail to reject that the variables are independent at signicance level',
    alpha)
'''
c_t = pd.DataFrame([[250,200],[50,1000]],columns=['Plays Chess','Doesnt Play Chess'],
                   index=['Likes Science Fiction','Doesnt Like Science Fiction'])

chi2_contingency(c_t)
'''
def test_chi2_contingency_R():
    """Some test cases that were computed independently, using R.

    The R code and its output are kept as comments next to each case so
    the reference numbers can be reproduced.
    """
    # --- 3-way table ---
    #
    # R code:
    #     # Data vector.
    #     data <- c(
    #       12, 34, 23,     4,  47,  11,
    #       35, 31, 11,    34,  10,  18,
    #       12, 32,  9,    18,  13,  19,
    #       12, 12, 14,     9,  33,  25
    #       )
    #
    #     # Create factor tags:r=rows, c=columns, t=tiers
    #     r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #     c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #     t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #     # 3-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+t))
    #     print(s)
    #
    # R output:
    #     Call: xtabs(formula = data ~ r + c + t)
    #     Number of cases in table: 478
    #     Number of factors: 3
    #     Test for independence of all factors:
    #             Chisq = 102.17, df = 17, p-value = 3.514e-14
    flat_counts = [12, 34, 23, 35, 31, 11, 12, 32, 9, 12, 12, 14,
                   4, 47, 11, 34, 10, 18, 18, 13, 19, 9, 33, 25]
    # Row-major reshape reproduces the nested 2x4x3 layout.
    table = np.array(flat_counts).reshape(2, 4, 3)
    stat, pvalue, ndof, _ = chi2_contingency(table)
    assert_approx_equal(stat, 102.17, significant=5)
    assert_approx_equal(pvalue, 3.514e-14, significant=4)
    assert_equal(ndof, 17)

    # --- 4-way table ---
    #
    # R code:
    #     # Data vector.
    #     data <- c(
    #         #
    #         12, 17,
    #         11, 16,
    #         #
    #         11, 12,
    #         15, 16,
    #         #
    #         23, 15,
    #         30, 22,
    #         #
    #         14, 17,
    #         15, 16
    #         )
    #
    #     # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #     r <- factor(gl(2, 2,  2*2*2*2, labels=c("r1", "r2")))
    #     c <- factor(gl(2, 1,  2*2*2*2, labels=c("c1", "c2")))
    #     d <- factor(gl(2, 4,  2*2*2*2, labels=c("d1", "d2")))
    #     t <- factor(gl(2, 8,  2*2*2*2, labels=c("t1", "t2")))
    #
    #     # 4-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+d+t))
    #     print(s)
    #
    # R output:
    #     Call: xtabs(formula = data ~ r + c + d + t)
    #     Number of cases in table: 262
    #     Number of factors: 4
    #     Test for independence of all factors:
    #             Chisq = 8.758, df = 11, p-value = 0.6442
    table = np.array([
        [[[12, 17], [11, 16]], [[11, 12], [15, 16]]],
        [[[23, 15], [30, 22]], [[14, 17], [15, 16]]],
    ])
    stat, pvalue, ndof, _ = chi2_contingency(table)
    assert_approx_equal(stat, 8.758, significant=4)
    assert_approx_equal(pvalue, 0.6442, significant=4)
    assert_equal(ndof, 11)
Ejemplo n.º 17
0
def test_chi2_contingency_yates_gh13875():
    """Regression test for gh-13875.

    The magnitude of Yates' continuity correction should not exceed the
    difference between the expected and observed values.  For this table
    the p-value must equal 1 to within a relative tolerance of 1e-12.
    """
    table = np.array([[1573, 3], [4, 0]])
    _, pvalue, _, _ = chi2_contingency(table)
    assert_allclose(pvalue, 1, rtol=1e-12)