def test_chisquare_masked_arrays():
    """Check that mstats.chisquare ignores masked entries.

    The other tests were taken from the tests for stats.chisquare, so
    they don't test the function with masked arrays.  Here masked arrays
    are tested.
    """
    # stats.chisqprob is not available in recent SciPy releases; the
    # chi-square survival function computes the same tail probability.
    chi2_sf = stats.distributions.chi2.sf

    # Each column keeps only its unmasked entries: [8, 8, 16, 32] and
    # [3, 4, 5].  The -1 placeholders sit under the mask.
    obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T
    mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T
    mobs = ma.masked_array(obs, mask)
    expected_chisq = np.array([24.0, 0.5])

    chisq, p = mstats.chisquare(mobs)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, chi2_sf(expected_chisq, mobs.count(axis=0) - 1))

    # Same data, transposed, reduced along the other axis.
    chisq, p = mstats.chisquare(mobs.T, axis=1)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, chi2_sf(expected_chisq, mobs.T.count(axis=1) - 1))

    # When axis=None, the two values should have type np.float64.
    chisq, p = mstats.chisquare([1, 2, 3], axis=None)
    assert_(isinstance(chisq, np.float64))
    assert_(isinstance(p, np.float64))
    assert_equal(chisq, 1.0)
    assert_almost_equal(p, chi2_sf(1.0, 2))
def corrolateAmps(self):
    """Return the index of the simulated row that best matches the experiment.

    For every simulated row (all but the last two), the experimental
    background and amplitude curves are rescaled so their maxima equal the
    maxima of the simulated curves, and ``st.chisquare`` is evaluated for
    both pairs.  The returned value is the row index whose first stored
    value (the amplitude chi-square result's first element) is smallest.
    """
    def _match_peak(measured, simulated):
        # Scale the measured curve so its maximum equals the simulated one.
        return (simulated.max() / measured.max()) * measured

    self.adjustPhase(self.exp_phis, self.sim_phis[0, 0:359])

    rows = []
    # TODO: fix sim data to avoid using "-2" to skip the 2 extra data rows
    for row in range(len(self.sim_amps) - 2):
        sim_bg = self.sim_bgs[row, 0:359]
        scaled_bg = _match_peak(self.exp_bgs, sim_bg)

        sim_amp = self.sim_amps[row, 0:359]
        scaled_amp = _match_peak(self.exp_amps, sim_amp)

        # NOTE(review): the simulated curve is passed as f_obs and the
        # measured one as f_exp -- confirm this order is intended.
        amp_result = st.chisquare(sim_amp, scaled_amp)
        bg_result = st.chisquare(sim_bg, scaled_bg)

        # TODO: fix -- "+" concatenates the two results into one row of
        # values rather than adding the statistics together.
        rows.append(amp_result + bg_result)

    # Only column 0 (taken from the amplitude result) decides the minimum.
    return np.array(rows)[:, 0].argmin()
def check_chisquare(f_obs, f_exp, ddof, axis, expected_chi2):
    """Assert that mstats.chisquare yields the expected statistic and p-value.

    Use this only for arrays that have no masked values.

    Parameters
    ----------
    f_obs, f_exp : array_like
        Observed and expected frequencies forwarded to ``mstats.chisquare``.
    ddof : int or array_like
        Forwarded as ``ddof``.
    axis : int, None or 'no'
        Forwarded as ``axis``; the sentinel ``'no'`` means "do not pass the
        ``axis`` argument at all".
    expected_chi2 : array_like
        Statistic that ``mstats.chisquare`` must return exactly.

    Raises
    ------
    AssertionError
        If the statistic or p-value differs from the expected values, or
        from what ``stats.chisquare`` returns for the same input.
    """
    f_obs = np.asarray(f_obs)
    if axis is None:
        num_obs = f_obs.size
    else:
        # When no axis is passed the reduction runs along axis 0.
        use_axis = 0 if axis == 'no' else axis
        b = np.broadcast(f_obs, f_exp)
        num_obs = b.shape[use_axis]

    if axis == 'no':
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, axis=axis)
    assert_array_equal(chi2, expected_chi2)

    ddof = np.asarray(ddof)
    # stats.chisqprob is not available in recent SciPy releases; the
    # chi-square survival function computes the same tail probability.
    expected_p = stats.distributions.chi2.sf(expected_chi2,
                                             num_obs - 1 - ddof)
    assert_array_equal(p, expected_p)

    # Also compare to stats.chisquare
    if axis == 'no':
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof,
                                               axis=axis)
    assert_array_almost_equal(chi2, stats_chisq)
    assert_array_almost_equal(p, stats_p)
def corrolateAmps(self):
    """Return the index of the simulated row that best matches the experiment.

    For every simulated row (all but the last two), the experimental
    background and amplitude curves are rescaled so their maxima equal the
    maxima of the simulated curves, and ``st.chisquare`` is evaluated for
    both pairs.  The returned value is the row index whose first stored
    value (the amplitude chi-square result's first element) is smallest.
    """
    def _match_peak(measured, simulated):
        # Scale the measured curve so its maximum equals the simulated one.
        return (simulated.max() / measured.max()) * measured

    self.adjustPhase(self.exp_phis, self.sim_phis[0, 0:359])

    rows = []
    # TODO: fix sim data to avoid using "-2" to skip the 2 extra data rows
    for row in range(len(self.sim_amps) - 2):
        sim_bg = self.sim_bgs[row, 0:359]
        scaled_bg = _match_peak(self.exp_bgs, sim_bg)

        sim_amp = self.sim_amps[row, 0:359]
        scaled_amp = _match_peak(self.exp_amps, sim_amp)

        # NOTE(review): the simulated curve is passed as f_obs and the
        # measured one as f_exp -- confirm this order is intended.
        amp_result = st.chisquare(sim_amp, scaled_amp)
        bg_result = st.chisquare(sim_bg, scaled_bg)

        # TODO: fix -- "+" concatenates the two results into one row of
        # values rather than adding the statistics together.
        rows.append(amp_result + bg_result)

    # Only column 0 (taken from the amplitude result) decides the minimum.
    return np.array(rows)[:, 0].argmin()
def count_by_day_of_week(dates):
    """
    Summarise entries per weekday and test them against adjusted frequencies.

    takes a series of dates
    returns a tuple ``(mean_per_day, p)``:

    * ``mean_per_day`` -- mean number of entries per distinct date, grouped
      by day of the week, e.g.
          (day_of_week, value)
          0 123
          1 2
          2 8
          3 2
          4 322
          5 9
          6 1
    * ``p`` -- p-value of ``chisquare`` comparing the total hits per weekday
      with the expectation derived from ``_get_adj_freqs(dates)``.
    """
    # careful: do not forget days with zero entries
    counts = pd.DataFrame(dates.value_counts())
    counts.columns = ['entries']
    counts['day'] = counts.index.map(lambda stamp: stamp.weekday())

    mean_per_day = counts['entries'].groupby(counts['day']).mean()
    per_weekday = counts['entries'].groupby(counts['day']).sum()

    # chi square: scale the adjusted frequencies so they add up to the
    # total number of hits before comparing.
    df = pd.DataFrame({'hits': per_weekday,
                       'dfreq': _get_adj_freqs(dates)})
    df['adjusted_expectation'] = df.dfreq.map(
        lambda freq: freq * df.hits.sum() / df.dfreq.sum())
    result = chisquare(df.hits, df.adjusted_expectation)
    p = result.pvalue
    return mean_per_day, p
def calculate_chi_square(self):
    """Run ``statistics.chisquare`` on the stored observations.

    The expected frequencies come from ``self.calculate_expected_values()``.
    Returns whatever ``statistics.chisquare`` returns (statistic and
    p-value).
    """
    return statistics.chisquare(
        self.observations,
        f_exp=self.calculate_expected_values(),
    )
def chisq_independence(col1, col2):
    """Chi-square test of independence between two categorical columns.

    Parameters
    ----------
    col1, col2 : array_like / pandas.Series
        Paired categorical observations, cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    "TMC"
        When there are, on average, 5 or fewer observations per cell of
        the contingency table.
    float
        The p-value rounded to 3 decimals.
    str
        The rounded p-value followed by ``"*"`` when any expected
        frequency is below 1 or at least 20% of them are below 5.
    """
    contingencyTable = pd.crosstab(col1, col2, margins=True)
    # The last row and column hold the "All" margins.
    n_rows = contingencyTable.shape[0] - 1
    n_cols = contingencyTable.shape[1] - 1

    # float() keeps this a true division on Python 2 as well.
    if len(col1) / float(n_rows * n_cols) <= 5:
        return "TMC"

    # Expected count of a cell = row total * column total / grand total.
    # Computed as a float array instead of writing floats cell by cell
    # into a copy of the integer table.
    row_totals = contingencyTable.iloc[:-1, -1].values.astype(float)
    col_totals = contingencyTable.iloc[-1, :-1].values.astype(float)
    total = float(contingencyTable.iloc[-1, -1])
    expected = row_totals[:, None] * col_totals[None, :] / total

    observed_frq = contingencyTable.iloc[:-1, :-1].values.ravel()
    expected_frq = expected.ravel()

    numless1 = len(expected_frq[expected_frq < 1])
    perless5 = len(expected_frq[expected_frq < 5]) / float(len(expected_frq))

    # Adjustment in DOF so the 1D chisquare can be used on matrix shaped
    # data: rows*cols - 1 - ddof must equal (rows - 1) * (cols - 1).
    matrixadj = n_rows + n_cols - 2
    pval = np.round(
        chisquare(observed_frq, expected_frq, ddof=matrixadj)[1], 3)

    if numless1 > 0 or perless5 >= 0.2:
        return str(pval) + "*"
    return pval
def chi2Sam(self, sam, baseF, debug=False, nfcn=-1):
    '''
    determine chi2 for sample given base function baseF
    Fit function specified by baseF to sample data to extract constant,
    then determine chisquare of fit

    sam   : sequence of (x, y) pairs
    baseF : stored on self.baseF before fitting
    debug : print intermediate values when true
    nfcn  : key under which the fitted curve is stored in self.theFits;
            must be > 0 when self.plotFits is true, otherwise the
            program exits via sys.exit

    returns (chisq, ndf, pvalue) with ndf = len(x) - 1
    '''
    def _say(*items):
        # One pre-joined string prints identically on Python 2 and 3.
        print(' '.join(str(item) for item in items))

    x = numpy.array([a for a, _ in sam])
    y = numpy.array([b for _, b in sam])
    if debug:
        _say('adsorpMC.chi2Sam sam', sam, 'x', x, 'y', y)
    self.baseF = baseF

    # do the fit
    a0, _ = curve_fit(self.func1, x, y)
    if debug:
        _say('adsorpMC.chis2Sam a0', a0)

    if self.plotFits:
        if nfcn <= 0:
            sys.exit('adsorpMC.chi2Sam ERROR nfcn=' + str(nfcn) + ' should be >0')
        xf = numpy.linspace(0., max(self.RefX), 1000)
        yf = self.func1(xf, a0)
        self.theFits[nfcn] = [xf, yf]

    # evaluate chisquare
    observed = y
    expected = self.func1(x, a0)
    if debug:
        _say('adsorpMC.chis2Sam observed', observed)
        _say('adsorpMC.chis2Sam expected', expected)

    ndf = len(x) - 1
    # NOTE(review): the third positional argument is ddof; with ddof=-1
    # the p-value uses len(x) degrees of freedom, while ndf reports
    # len(x) - 1 -- confirm which one is intended.
    chisq, pvalue = chisquare(observed, expected, -1)
    if debug:
        _say('adsorpMC.chis2Sam chisq,pvalue,ndf', chisq, pvalue, ndf)
    return chisq, ndf, pvalue
def chisq_independence(self, col1, col2, verbose=False):
    """Chi-square test of independence between two categorical columns.

    Parameters
    ----------
    col1, col2 : pandas.Series
        Paired categorical observations, cross-tabulated with
        ``pd.crosstab``.
    verbose : bool
        When true, print the column names and the contingency table.

    Returns
    -------
    "TMC"
        When there are, on average, 5 or fewer observations per cell of
        the contingency table.
    float
        The p-value rounded to 3 decimals.
    str
        The rounded p-value followed by ``"*"`` when any expected
        frequency is below 1 or at least 20% of them are below 5.
    """
    contingencyTable = pd.crosstab(col1, col2, margins=True)
    # The last row and column hold the "All" margins.
    n_rows = contingencyTable.shape[0] - 1
    n_cols = contingencyTable.shape[1] - 1

    # float() keeps this a true division on Python 2 as well.
    if len(col1) / float(n_rows * n_cols) <= 5:
        return "TMC"

    # Expected count of a cell = row total * column total / grand total.
    # Computed as a float array instead of writing floats cell by cell
    # into a copy of the integer table.
    row_totals = contingencyTable.iloc[:-1, -1].values.astype(float)
    col_totals = contingencyTable.iloc[-1, :-1].values.astype(float)
    total = float(contingencyTable.iloc[-1, -1])
    expected = row_totals[:, None] * col_totals[None, :] / total

    if verbose:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('\n\nAnalysis of models: %s and %s' % (col1.name, col2.name))
        print('Contingency Table:')
        print(contingencyTable)

    observed_frq = contingencyTable.iloc[:-1, :-1].values.ravel()
    expected_frq = expected.ravel()

    numless1 = len(expected_frq[expected_frq < 1])
    perless5 = len(expected_frq[expected_frq < 5]) / float(len(expected_frq))

    # Adjustment in DOF so the 1D chisquare can be used on matrix shaped
    # data: rows*cols - 1 - ddof must equal (rows - 1) * (cols - 1).
    matrixadj = n_rows + n_cols - 2
    pval = np.round(
        chisquare(observed_frq, expected_frq, ddof=matrixadj)[1], 3)

    if numless1 > 0 or perless5 >= 0.2:
        return str(pval) + "*"
    return pval
def kftest(df, column, label, tag):
    """Chi-square test of independence between ``column`` and ``label``.

    Rows with a missing value in either column are dropped first.

    Parameters
    ----------
    df : pandas.DataFrame
    column : name of the categorical feature column
    label : name of the label column; the summary reads the label values
        0 and 1, so both must be present
    tag : value of ``column`` to summarise in the returned string

    Returns
    -------
    str
        ``"n(share),n0(share0),n1(share1),statistic,p_value"`` where ``n``
        is the count of ``tag`` overall and ``n0``/``n1`` are its counts
        within label 0 and label 1, each followed by its share.
    """
    df_tmp = df.loc[:, [column, label]].dropna()
    column_values = df_tmp[column]
    label_values = df_tmp[label]
    col = dict(column_values.value_counts())
    lab = dict(label_values.value_counts())
    total = float(sum(lab.values()))

    f_obs = []
    f_exp = []
    obs_d = {}
    for i in col:
        in_category = column_values == i
        for j in lab:
            # Vectorized count of rows holding this (category, label) pair.
            obs = int((in_category & (label_values == j)).sum())
            obs_d.setdefault(j, {})[i] = obs
            f_obs.append(obs)
            f_exp.append(lab[j] / total * col[i])

    # An r x c table has (r - 1) * (c - 1) degrees of freedom; chisquare
    # uses len(f_obs) - 1 - ddof.  At least 1 is kept so 2 x 2 and
    # degenerate tables behave as before.
    dof = max((len(col) - 1) * (len(lab) - 1), 1)
    statics, p_value = chisquare(f_obs, f_exp, ddof=len(f_obs) - 1 - dof)

    str1 = "%d(%f),%d(%f),%d(%f),%f,%f" % (
        col[tag], col[tag] * 1. / sum(col.values()),
        obs_d[0][tag], 1. * obs_d[0][tag] / sum(obs_d[0].values()),
        obs_d[1][tag], 1. * obs_d[1][tag] / sum(obs_d[1].values()),
        statics, p_value)
    return str1
def kftest(df, column, label, tag):
    """Chi-square test of independence between ``column`` and ``label``.

    Rows with a missing value in either column are dropped first.

    Parameters
    ----------
    df : pandas.DataFrame
    column : name of the categorical feature column
    label : name of the label column; the summary reads the label values
        0 and 1, so both must be present
    tag : value of ``column`` to summarise in the returned string

    Returns
    -------
    str
        ``"n(share),n0(share0),n1(share1),statistic,p_value"`` where ``n``
        is the count of ``tag`` overall and ``n0``/``n1`` are its counts
        within label 0 and label 1, each followed by its share.
    """
    df_tmp = df.loc[:, [column, label]].dropna()
    column_values = df_tmp[column]
    label_values = df_tmp[label]
    col = dict(column_values.value_counts())
    lab = dict(label_values.value_counts())
    total = float(sum(lab.values()))

    f_obs = []
    f_exp = []
    obs_d = {}
    for i in col:
        in_category = column_values == i
        for j in lab:
            # Vectorized count of rows holding this (category, label) pair.
            obs = int((in_category & (label_values == j)).sum())
            obs_d.setdefault(j, {})[i] = obs
            f_obs.append(obs)
            f_exp.append(lab[j] / total * col[i])

    # An r x c table has (r - 1) * (c - 1) degrees of freedom; chisquare
    # uses len(f_obs) - 1 - ddof.  At least 1 is kept so 2 x 2 and
    # degenerate tables behave as before.
    dof = max((len(col) - 1) * (len(lab) - 1), 1)
    statics, p_value = chisquare(f_obs, f_exp, ddof=len(f_obs) - 1 - dof)

    str1 = "%d(%f),%d(%f),%d(%f),%f,%f" % (
        col[tag], col[tag] * 1. / sum(col.values()),
        obs_d[0][tag], 1. * obs_d[0][tag] / sum(obs_d[0].values()),
        obs_d[1][tag], 1. * obs_d[1][tag] / sum(obs_d[1].values()),
        statics, p_value)
    return str1
def chisquare_test(observed0, expected0):
    """Return the p-value comparing ``observed0`` with ``expected0``.

    ``chisquare`` is used when every observed count is greater than 5;
    otherwise the two sequences are stacked into a table and passed to
    ``fisher_exact``.  In both cases the second element of the result
    (the p-value) is returned.
    """
    observed = np.array(observed0)
    expected = np.array(expected0)

    all_counts_large = min(observed0) > 5
    result = (chisquare(observed, expected) if all_counts_large
              else fisher_exact([observed, expected]))
    return result[1]
def calculate_chi_square(self):
    """Run ``statistics.chisquare`` on ``self.observe``.

    ``self.expected`` supplies the expected frequencies; when it is None
    they are obtained from ``self.calculate_expected_values()`` instead.
    Returns whatever ``statistics.chisquare`` returns (statistic and
    p-value).
    """
    expected = self.expected
    if expected is None:
        expected = self.calculate_expected_values()
    return statistics.chisquare(self.observe, f_exp=expected)
def test_chisquare_ddof_broadcasting():
    """Test that ddof broadcasts correctly against the statistic."""
    # observed has shape (4, 2).  With the default axis=0 the statistic
    # has shape (2,).
    observed = np.array([[1, 2, 3, 2], [3, 2, 2, 5]]).T
    expected_stat = [1.0, 2.0]

    # ddof has shape (2, 1).  Broadcast against the statistic, the
    # p-values get shape (2, 2): one row per ddof value.
    ddof = np.array([[0], [1]])

    stat, pvalues = mstats.chisquare(observed, ddof=ddof)
    assert_array_equal(stat, expected_stat)

    # Each row of the broadcast result must match a scalar-ddof call.
    rows = []
    for row_index in range(2):
        stat_i, p_i = mstats.chisquare(observed, ddof=ddof[row_index, 0])
        assert_array_equal(stat_i, expected_stat)
        rows.append(p_i)

    assert_array_equal(pvalues, np.vstack(tuple(rows)))
def test_chisquare_ddof_broadcasting():
    """Test that ddof broadcasts correctly against the statistic."""
    # observed has shape (4, 2).  With the default axis=0 the statistic
    # has shape (2,).
    observed = np.array([[1, 2, 3, 2], [3, 2, 2, 5]]).T
    expected_stat = [1.0, 2.0]

    # ddof has shape (2, 1).  Broadcast against the statistic, the
    # p-values get shape (2, 2): one row per ddof value.
    ddof = np.array([[0], [1]])

    stat, pvalues = mstats.chisquare(observed, ddof=ddof)
    assert_array_equal(stat, expected_stat)

    # Each row of the broadcast result must match a scalar-ddof call.
    rows = []
    for row_index in range(2):
        stat_i, p_i = mstats.chisquare(observed, ddof=ddof[row_index, 0])
        assert_array_equal(stat_i, expected_stat)
        rows.append(p_i)

    assert_array_equal(pvalues, np.vstack(tuple(rows)))
def test_chisquare_masked_arrays():
    """Check that mstats.chisquare ignores masked entries.

    The other tests were taken from the tests for stats.chisquare, so
    they don't test the function with masked arrays.  Here masked arrays
    are tested.
    """
    # stats.chisqprob is not available in recent SciPy releases; the
    # chi-square survival function computes the same tail probability.
    chi2_sf = stats.distributions.chi2.sf

    # Each column keeps only its unmasked entries: [8, 8, 16, 32] and
    # [3, 4, 5].  The -1 placeholders sit under the mask.
    obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T
    mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T
    mobs = ma.masked_array(obs, mask)
    expected_chisq = np.array([24.0, 0.5])

    chisq, p = mstats.chisquare(mobs)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, chi2_sf(expected_chisq, mobs.count(axis=0) - 1))

    # Same data, transposed, reduced along the other axis.
    chisq, p = mstats.chisquare(mobs.T, axis=1)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, chi2_sf(expected_chisq, mobs.T.count(axis=1) - 1))

    # When axis=None, the two values should have type np.float64.
    chisq, p = mstats.chisquare([1, 2, 3], axis=None)
    assert_(isinstance(chisq, np.float64))
    assert_(isinstance(p, np.float64))
    assert_equal(chisq, 1.0)
    assert_almost_equal(p, chi2_sf(1.0, 2))
def solve(self, data, targetname, targetpara):
    """Run the analysis selected by ``targetname`` and record conclusions.

    Only ``'Test_Uniform_Discrete'`` is handled: for every name in
    ``targetpara`` the values of ``data[name]`` are counted per distinct
    value and tested with ``stats.chisquare`` against equal frequencies.
    One sentence per name is appended to ``self.conclusion``; a p-value
    above 0.005 is reported as approximately uniform.

    Parameters
    ----------
    data : mapping of name -> array-like of discrete values
    targetname : str
        Name of the test to run; other names are ignored.
    targetpara : sequence of str
        Keys of ``data`` to examine.

    Returns
    -------
    None
    """
    if targetname == 'Test_Uniform_Discrete':
        # example test to see if, aggregated by some discrete value, the
        # number of sequences follows a uniform distribution
        for target in targetpara:
            var_study = data[target]
            # numpy.unique returns the sorted distinct values as an
            # ndarray together with how often each one occurs.
            value_list, var_nb = numpy.unique(var_study, return_counts=True)
            test_chi2 = stats.chisquare(var_nb)
            if test_chi2[1] > 0.005:
                verdict = ' follows approximately a discrete uniform distribution on: \n '
            else:
                verdict = ' does not follow a discrete uniform distribution on: \n '
            # value_list is an ndarray and has no ``.values`` attribute.
            self.conclusion.append(target + verdict + str(value_list) + '.')
    return
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 7 14:07:45 2011

@author: Sat Kumar Tomer
@website: www.ambhas.com
@email: [email protected]

Minimal example of a chi-square goodness-of-fit test: prints the test
statistic and the p-value for one set of observed and expected counts.
"""
# import required modules
from scipy.stats.mstats import chisquare
import numpy as np

# Observed and expected frequencies for four categories.
f_obs = np.array([10, 15, 20, 30])
f_exp = np.array([10, 5, 15, 30])

# chisquare() needs both frequency sets to add up to the same total (here
# they are 75 and 60), so the expected counts are rescaled to the observed
# total while keeping their proportions.
f_exp = f_exp * (float(f_obs.sum()) / f_exp.sum())

c, p = chisquare(f_obs, f_exp)
print(c, p)
import numpy as np
from scipy.stats.mstats import chisquare
from scipy.stats import fisher_exact

# Two pairs of counts: compared once with a chi-square test and once,
# stacked as a 2 x 2 table, with Fisher's exact test.
observed = np.array([2, 188])
expected = np.array([1, 80])

# chisquare() needs both frequency sets to add up to the same total (here
# they are 190 and 81), so the expected counts are rescaled to the observed
# total while keeping their proportions.  float() keeps the ratio a true
# division on Python 2.
expected_scaled = expected * (float(observed.sum()) / expected.sum())

a = chisquare(observed, expected_scaled)
# Fisher's exact test works on the raw counts.
b = fisher_exact([observed, expected])

# Single-argument print(...) behaves the same on Python 2 and 3.
print(a)
print(b)
import numpy as np
from scipy.stats.mstats import chisquare
from scipy.stats import fisher_exact

# Two pairs of counts: compared once with a chi-square test and once,
# stacked as a 2 x 2 table, with Fisher's exact test.
observed = np.array([34, 111])
expected = np.array([71, 281])

# chisquare() needs both frequency sets to add up to the same total (here
# they are 145 and 352), so the expected counts are rescaled to the
# observed total while keeping their proportions.  float() keeps the ratio
# a true division on Python 2.
expected_scaled = expected * (float(observed.sum()) / expected.sum())

a = chisquare(observed, expected_scaled)
# Fisher's exact test works on the raw counts.
b = fisher_exact([observed, expected])

# Single-argument print(...) behaves the same on Python 2 and 3.
print(a)
print(b)
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 7 14:07:45 2011

@author: Sat Kumar Tomer
@website: www.ambhas.com
@email: [email protected]

Minimal example of a chi-square goodness-of-fit test: prints the test
statistic and the p-value for one set of observed and expected counts.
"""
# import required modules
from scipy.stats.mstats import chisquare
import numpy as np

# Observed and expected frequencies for four categories.
f_obs = np.array([10, 15, 20, 30])
f_exp = np.array([10, 5, 15, 30])

# chisquare() needs both frequency sets to add up to the same total (here
# they are 75 and 60), so the expected counts are rescaled to the observed
# total while keeping their proportions.
f_exp = f_exp * (float(f_obs.sum()) / f_exp.sum())

c, p = chisquare(f_obs, f_exp)
print(c, p)
def test_chisquare(guys):
    """Return the chi-square p-value of the stream ``guys``.

    The stream is first reduced with ``count_digs`` and the resulting
    counts are passed to ``chisquare``; only the p-value is returned.
    """
    _, p_value = chisquare(count_digs(guys))
    return p_value