def test_chi2_contingency_g():
    """Check the statistic chi2_contingency returns for the G-test.

    With ``lambda_='log-likelihood'`` the returned statistic is compared
    against the direct formula ``2 * sum(obs * log(obs / expected))``.
    """
    table_2x2 = np.array([[15, 60], [15, 90]])

    # 2x2 table, continuity correction switched off.
    stat, _, _, expected = chi2_contingency(
        table_2x2, lambda_='log-likelihood', correction=False)
    assert_allclose(stat, 2 * xlogy(table_2x2, table_2x2 / expected).sum())

    # Same table with the correction switched on: the reference value is
    # built from the observations shifted by +/- 0.5.
    stat, _, _, expected = chi2_contingency(
        table_2x2, lambda_='log-likelihood', correction=True)
    shifted = table_2x2 + np.array([[-0.5, 0.5], [0.5, -0.5]])
    assert_allclose(stat, 2 * xlogy(shifted, shifted / expected).sum())

    # 2x3 table, `correction` left at its default.
    table_2x3 = np.array([[10, 12, 10], [12, 10, 10]])
    stat, _, _, expected = chi2_contingency(
        table_2x3, lambda_='log-likelihood')
    assert_allclose(stat, 2 * xlogy(table_2x3, table_2x3 / expected).sum())
def test_chi2_contingency_trivial():
    """Exercise chi2_contingency on inputs whose result is known exactly."""
    # First a 2x2 table with identical rows, then plain 1-D data.  Both are
    # expected to give a statistic of 0, a p-value of 1 and expected
    # frequencies equal to the input; only the degrees of freedom differ.
    cases = (
        (np.array([[1, 2], [1, 2]]), 1),
        (np.array([1, 2, 3]), 0),
    )
    for observed, dof_ref in cases:
        stat, pvalue, dof, expected_freq = chi2_contingency(
            observed, correction=False)
        assert_equal(stat, 0.0)
        assert_equal(pvalue, 1.0)
        assert_equal(dof, dof_ref)
        assert_array_equal(observed, expected_freq)
def independentChi(A, B, alpha=0.05):
    """Run a chi-square contingency test on A and B at level ``alpha``.

    The table is obtained from ``getContingency(A, B)`` and tested without
    continuity correction.  The full test result is printed.

    Returns True when the p-value (element 1 of the result) is below
    ``alpha``.

    NOTE(review): the original comment said "A and B are independent if
    true", but a p-value below alpha normally means the hypothesis of
    independence is rejected.  Confirm which meaning callers rely on.
    """
    table = getContingency(A, B)
    outcome = chi2_contingency(table, correction=False)
    print(outcome)
    p_value = outcome[1]
    return p_value < alpha
def independentG(A, B, alpha=0.05):
    """Run a G-test (log-likelihood ratio) on A and B at level ``alpha``.

    The table is obtained from ``getContingency(A, B)`` and tested with
    ``lambda_="log-likelihood"`` and no continuity correction.  The full
    test result is printed.

    Returns True when the p-value (element 1 of the result) is below
    ``alpha``.

    NOTE(review): the original comment said "A and B are independent if
    true", but a p-value below alpha normally means the hypothesis of
    independence is rejected.  Confirm which meaning callers rely on.
    """
    table = getContingency(A, B)
    outcome = chi2_contingency(
        table, correction=False, lambda_="log-likelihood")
    print(outcome)
    p_value = outcome[1]
    return p_value < alpha
def Analysis(rdata):
    """Test whether coffee-brand preference is associated with gender.

    Parameters
    ----------
    rdata
        Any input accepted by ``pd.DataFrame`` that yields the columns
        ``gender`` and ``co_survey``.

    Returns
    -------
    crosstal : pandas.DataFrame
        Cross-tabulation with ``genNum`` as index and ``co_survey`` as
        columns.
    result : str
        HTML fragment (in Korean) stating the p-value and whether the null
        hypothesis is accepted or rejected at the 0.05 level.
    """
    # Store the incoming data as a DataFrame.
    df = pd.DataFrame(rdata)

    # Added column genNum: 1 when gender is '남' (male), otherwise 2.
    df['genNum'] = df['gender'].apply(lambda g: 1 if g == '남' else 2)

    # Added column coNum: Starbucks 1, Coffee Bean 2, Ediya 3, anything
    # else 4.  It is not used by the test below; it is kept only so the
    # frame still carries the same columns as before.
    brand_codes = {'스타벅스': 1, '커피빈': 2, '이디아': 3}
    df['coNum'] = df['co_survey'].apply(lambda c: brand_codes.get(c, 4))

    crosstal = pd.crosstab(index=df['genNum'], columns=df['co_survey'])

    # chi2_contingency expects a table of observed frequencies.  The earlier
    # call passed the two raw coded columns, which were then read as a 2 x N
    # table of "counts" and produced a meaningless p-value.  The test is now
    # run on the cross-tabulated counts.
    _, pv, _, _ = chi2_contingency(crosstal)

    if pv > 0.05:
        result = "<b>p 값이 {}</b>이므로 유의수준 0.05보다 커 <b>귀무가설을 채택</b><br>(성별에 따라 선호하는 커피 브랜드에는 차이가 없다)".format(pv)
    else:
        result = "<b>p 값이 {}</b>이므로 유의수준 0.05보다 작아 <b>귀무가설을 기각</b><br>(성별에 따라 선호하는 커피 브랜드에는 차이가 있다)".format(pv)
    return crosstal, result
def test_chi2_contingency_R():
    # Compare chi2_contingency with reference values that were computed
    # independently in R, for one 3-way and one 4-way table.

    # ---- 3-way table ----
    # R script used for the reference values:
    #
    #     # Data vector.
    #     data <- c(
    #       12, 34, 23,     4,  47,  11,
    #       35, 31, 11,    34,  10,  18,
    #       12, 32,  9,    18,  13,  19,
    #       12, 12, 14,     9,  33,  25
    #       )
    #
    #     # Create factor tags:r=rows, c=columns, t=tiers
    #     r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #     c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #     t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #     # 3-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+t))
    #     print(s)
    #
    # R output:
    #
    #     Call: xtabs(formula = data ~ r + c + t)
    #     Number of cases in table: 478
    #     Number of factors: 3
    #     Test for independence of all factors:
    #             Chisq = 102.17, df = 17, p-value = 3.514e-14
    three_way = np.array([
        [[12, 34, 23], [35, 31, 11], [12, 32, 9], [12, 12, 14]],
        [[4, 47, 11], [34, 10, 18], [18, 13, 19], [9, 33, 25]],
    ])
    stat, pvalue, dof, _ = chi2_contingency(three_way)
    assert_approx_equal(stat, 102.17, significant=5)
    assert_approx_equal(pvalue, 3.514e-14, significant=4)
    assert_equal(dof, 17)

    # ---- 4-way table ----
    # R script used for the reference values:
    #
    #     # Data vector.
    #     data <- c(
    #         #
    #         12, 17,
    #         11, 16,
    #         #
    #         11, 12,
    #         15, 16,
    #         #
    #         23, 15,
    #         30, 22,
    #         #
    #         14, 17,
    #         15, 16
    #         )
    #
    #     # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #     r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
    #     c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
    #     d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
    #     t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
    #
    #     # 4-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+d+t))
    #     print(s)
    #
    # R output:
    #
    #     Call: xtabs(formula = data ~ r + c + d + t)
    #     Number of cases in table: 262
    #     Number of factors: 4
    #     Test for independence of all factors:
    #             Chisq = 8.758, df = 11, p-value = 0.6442
    four_way = np.array([
        [[[12, 17], [11, 16]], [[11, 12], [15, 16]]],
        [[[23, 15], [30, 22]], [[14, 17], [15, 16]]],
    ])
    stat, pvalue, dof, _ = chi2_contingency(four_way)
    assert_approx_equal(stat, 8.758, significant=4)
    assert_approx_equal(pvalue, 0.6442, significant=4)
    assert_equal(dof, 11)
import pandas as pd
import numpy as np
import scipy
from scipy.stats.contingency import chi2_contingency

# Load the trip records (absolute path on the author's machine).
df1 = pd.read_csv("C:/Users/Aaron Korver/Desktop/AnalyseCijfers2.csv")

# Collapse the trip purposes into two classes, labelled '1' and '0'.
df1['trip_purpose'] = df1['trip_purpose'].replace(
    ['betaaldwerk', 'dagelijkeboodschappen', 'diensten',
     'niet-dagelijkeboodschappen', 'studie'],
    '1')
df1['trip_purpose'] = df1['trip_purpose'].replace(
    ['recreatie', 'sociaal', 'vrijetijd', 'home'],
    '0')

# Cross-tabulate destination against the two purpose classes and keep a copy
# of the table on disk.
contingency = pd.crosstab(df1['BG2010NameDest'], df1['trip_purpose'])
contingency.to_csv("C:/Users/Aaron Korver/Desktop/ChiSquareOrig3.csv")

chi2, p, dof, expected = chi2_contingency(contingency)

# The Python 2 statement `print chi2, p, dof` is a syntax error on Python 3.
# Formatting first and printing one string gives the same space-separated
# output on both interpreters.
print("{} {} {}".format(chi2, p, dof))
def test_independence(data, colX, colY): X = data[colX].astype(str) Y = data[colY].astype(str) observed = pd.crosstab(Y, X) chi2, p, dof, expected = cn.chi2_contingency(observed.values) return p
"""
Created on Wed May  8 13:02:49 2019

@author: mfatemeh
"""

import pandas as pd
from scipy.stats.contingency import chi2_contingency

df = pd.read_csv('titanic_train.csv')
df.isnull().sum()

# First test: Survived against Pclass.
observed_contigency_table = pd.crosstab(df['Survived'], df['Pclass'])
chi_2, p_val, dof, expected_contigency_table = chi2_contingency(
    observed_contigency_table)
# Author's note: the degrees of freedom are 2, because (3 - 1) * 1 = 2.
# Author's note: the p-value is very low, so H_0 (independence) is rejected,
# i.e. the variables are dependent (correlated).

# Second test: Survived against Sex.  This overwrites p_val, so the decision
# printed below refers to this test only.
observed_contigency_table2 = pd.crosstab(df['Survived'], df['Sex'])
chi_2_G, p_val, dof_G, expected_contigency_table_G = chi2_contingency(
    observed_contigency_table2)

# Report the decision at significance level alpha.
alpha = 0.05
print(
    'The varianles are correlated at significant level'
    if p_val < alpha
    else 'The variables are independent at signifiacant level',
    alpha)
###--- dropping unnecessary feature df = df.drop(['# Columns: time'], axis=1) ###--- seperation of dependent and independent X = df.iloc[:, 2:7].values y = df.iloc[:, -1].values ###---correlation nalysis corr = df.corr() sns.heatmap(abs(corr), annot=True) from scipy.stats.contingency import chi2_contingency for i in range(len(X[0]) + 1): contigency_table = pd.crosstab(y, X[:, i - 1]) chi_2, p_val, dof, expected_val = chi2_contingency(contigency_table) # if p_value is less than alpha (pass) alpha = 0.05 if p_val < alpha: print('The varianles are correlated at significant level', alpha) else: print('The variables are independent at signifiacant level', alpha) ###-----encoding and labling df.detected_activity = df.detected_activity.map({ 'bending_1': 1, 'bending_2': 2, 'cycling': 3, 'lying': 4, 'sitting': 5, 'standing': 6,
def test_chi2_contingency_R():
    # Reference checks for chi2_contingency.  Every expected value below was
    # computed independently with R; the R input and output are quoted next
    # to each case.

    # Case 1: 3-way table.
    #
    # Rcode:
    #     # Data vector.
    #     data <- c(
    #       12, 34, 23,     4,  47,  11,
    #       35, 31, 11,    34,  10,  18,
    #       12, 32,  9,    18,  13,  19,
    #       12, 12, 14,     9,  33,  25
    #       )
    #
    #     # Create factor tags:r=rows, c=columns, t=tiers
    #     r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #     c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #     t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #     # 3-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+t))
    #     print(s)
    #
    # Routput:
    #     Call: xtabs(formula = data ~ r + c + t)
    #     Number of cases in table: 478
    #     Number of factors: 3
    #     Test for independence of all factors:
    #             Chisq = 102.17, df = 17, p-value = 3.514e-14
    table = np.array([[[12, 34, 23],
                       [35, 31, 11],
                       [12, 32, 9],
                       [12, 12, 14]],
                      [[4, 47, 11],
                       [34, 10, 18],
                       [18, 13, 19],
                       [9, 33, 25]]])
    result = chi2_contingency(table)
    assert_approx_equal(result[0], 102.17, significant=5)
    assert_approx_equal(result[1], 3.514e-14, significant=4)
    assert_equal(result[2], 17)

    # Case 2: 4-way table.
    #
    # Rcode:
    #     # Data vector.
    #     data <- c(
    #         #
    #         12, 17,
    #         11, 16,
    #         #
    #         11, 12,
    #         15, 16,
    #         #
    #         23, 15,
    #         30, 22,
    #         #
    #         14, 17,
    #         15, 16
    #         )
    #
    #     # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #     r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
    #     c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
    #     d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
    #     t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
    #
    #     # 4-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+d+t))
    #     print(s)
    #
    # Routput:
    #     Call: xtabs(formula = data ~ r + c + d + t)
    #     Number of cases in table: 262
    #     Number of factors: 4
    #     Test for independence of all factors:
    #             Chisq = 8.758, df = 11, p-value = 0.6442
    table = np.array([[[[12, 17], [11, 16]],
                       [[11, 12], [15, 16]]],
                      [[[23, 15], [30, 22]],
                       [[14, 17], [15, 16]]]])
    result = chi2_contingency(table)
    assert_approx_equal(result[0], 8.758, significant=4)
    assert_approx_equal(result[1], 0.6442, significant=4)
    assert_equal(result[2], 11)

    # Case 3: Yates' correction on an unbalanced 2x2 table.
    #
    #     > X <- matrix(c(1000, 10, 1, 0), ncol=2, nrow=2)
    #     > X
    #          [,1] [,2]
    #     [1,] 1000    1
    #     [2,]   10    0
    #     > chisq.test(X)
    #       Pearson's Chi-squared test with Yates' continuity correction
    #     data:  X
    #     X-squared = 5.163e-28, df = 1, p-value = 1
    table = np.array([[1000, 1], [10, 0]])
    result = chi2_contingency(table, correction=True)
    assert_approx_equal(result[1], 1.0, significant=2)
    # No atol is passed, so with a desired value of 0.0 this only passes
    # when the statistic is exactly zero.
    assert_allclose(result[0], 0.0)
def test_result(correction): obs = np.array([[1, 2], [1, 2]]) res = chi2_contingency(obs, correction=correction) assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
"""
Created on Wed May  8 13:04:22 2019

@author: gsaikia
"""
# The opening delimiter of the header docstring above was missing in the
# text as received, which left the file unparsable; it has been restored.

import pandas as pd
from scipy.stats.contingency import chi2_contingency

df = pd.read_csv('titanic_train.csv')
df.isnull().sum()

# Chi-square test on the Survived x Sex contingency table.
observed_contigency_table = pd.crosstab(df.Survived, df.Sex)
chi_2, p_val, dof, expected_contingency_table = chi2_contingency(
    observed_contigency_table)

# Report the decision at significance level alpha.
alpha = 0.05
if p_val < alpha:
    print('The variables are correlated at signicance level', alpha)
else:
    print(
        'We fail to reject that the variables are independent at signicance level',
        alpha)

# Unused example, previously kept in a bare string literal:
#
#   c_t = pd.DataFrame([[250,200],[50,1000]],
#                      columns=['Plays Chess','Doesnt Play Chess'],
#                      index=['Likes Science Fiction',
#                             'Doesnt Like Science Fiction'])
#   chi2_contingency(c_t)
def test_chi2_contingency_R():
    """Some test cases that were computed independently, using R."""
    # The R code and R output were previously stored in the unused local
    # variables `Rcode` and `Routput`.  They are reference material only,
    # so they are kept here as comments.

    # Rcode (3-way table):
    #
    #     # Data vector.
    #     data <- c(
    #       12, 34, 23,     4,  47,  11,
    #       35, 31, 11,    34,  10,  18,
    #       12, 32,  9,    18,  13,  19,
    #       12, 12, 14,     9,  33,  25
    #       )
    #
    #     # Create factor tags:r=rows, c=columns, t=tiers
    #     r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
    #     c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
    #     t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
    #
    #     # 3-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+t))
    #     print(s)
    #
    # Routput:
    #
    #     Call: xtabs(formula = data ~ r + c + t)
    #     Number of cases in table: 478
    #     Number of factors: 3
    #     Test for independence of all factors:
    #             Chisq = 102.17, df = 17, p-value = 3.514e-14
    obs = np.array(
        [[[12, 34, 23],
          [35, 31, 11],
          [12, 32, 9],
          [12, 12, 14]],
         [[4, 47, 11],
          [34, 10, 18],
          [18, 13, 19],
          [9, 33, 25]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 102.17, significant=5)
    assert_approx_equal(p, 3.514e-14, significant=4)
    assert_equal(dof, 17)

    # Rcode (4-way table):
    #
    #     # Data vector.
    #     data <- c(
    #         #
    #         12, 17,
    #         11, 16,
    #         #
    #         11, 12,
    #         15, 16,
    #         #
    #         23, 15,
    #         30, 22,
    #         #
    #         14, 17,
    #         15, 16
    #         )
    #
    #     # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
    #     r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
    #     c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
    #     d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
    #     t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
    #
    #     # 4-way Chi squared test of independence
    #     s = summary(xtabs(data~r+c+d+t))
    #     print(s)
    #
    # Routput:
    #
    #     Call: xtabs(formula = data ~ r + c + d + t)
    #     Number of cases in table: 262
    #     Number of factors: 4
    #     Test for independence of all factors:
    #             Chisq = 8.758, df = 11, p-value = 0.6442
    obs = np.array(
        [[[[12, 17],
           [11, 16]],
          [[11, 12],
           [15, 16]]],
         [[[23, 15],
           [30, 22]],
          [[14, 17],
           [15, 16]]]])
    chi2, p, dof, expected = chi2_contingency(obs)
    assert_approx_equal(chi2, 8.758, significant=4)
    assert_approx_equal(p, 0.6442, significant=4)
    assert_equal(dof, 11)
def test_chi2_contingency_yates_gh13875():
    # Regression test for gh-13875: the magnitude of Yates' continuity
    # correction should not exceed the difference between the expected and
    # the observed value of the statistic.  For this table the p-value is
    # required to be 1 to within rtol=1e-12.
    table = np.array([[1573, 3], [4, 0]])
    _, pvalue, _, _ = chi2_contingency(table)
    assert_allclose(pvalue, 1, rtol=1e-12)