def runTest(self):
    """Check chisqprob against reference values.

    The reference values are from pchisq as found in R version 2.5.1.
    It would be nice if there were some properties of chisqprob that
    could be verified directly, but none were found.
    """
    self.assertAlmostEqual(chisqprob(1, 1), 0.6826895, places=6)

    # A statistic of zero must give zero for every number of degrees
    # of freedom.  `range` (not `xrange`) keeps this runnable on both
    # Python 2 and Python 3.
    for k in range(100):
        self.assertEqual(chisqprob(0, k), 0)

    # Expected value of chisqprob(i, i) for i = 0..10.
    expected_diagonal = [
        0.0000000, 0.6826895, 0.6321206, 0.6083748,
        0.5939942, 0.5841198, 0.5768099, 0.5711201,
        0.5665299, 0.5627258, 0.5595067,
    ]
    for i, val in enumerate(expected_diagonal):
        self.assertAlmostEqual(chisqprob(i, i), val, places=6)
def test_independ(self, x, y, s):
    """Test whether x _|_ y | s holds in the stored data.

    Returns a tuple (p, statistic, d) where p is the probability of the
    available data given the null hypothesis of independence, statistic
    is the test statistic, and d is the number of degrees of freedom
    available in the data for this test.  When d is not positive the
    statistic is undefined and (1.0, 0.0, d) is returned instead.
    """
    cond_vars = tuple(s)
    data = self._data

    # Count tables over the conditioning set joined with x and/or y.
    joint = data.makeFactor(cond_vars + (x, y))
    margin_x = data.makeFactor(cond_vars + (x,))
    margin_y = data.makeFactor(cond_vars + (y,))
    margin_s = data.makeFactor(cond_vars)

    # Position of x (resp. y) within the sorted variables of its
    # marginal table; used to splice a value into an instance of s.
    pos_x = sorted(margin_x.variables()).index(x)
    pos_y = sorted(margin_y.variables()).index(y)

    def count_censored(table, variable, pos, cond_inst):
        # Number of values of `variable` whose count in `table` is zero
        # under the conditioning instance `cond_inst`.
        probe = list(cond_inst[:pos] + (None,) + cond_inst[pos:])
        censored = 0
        for value in data.values(variable):
            probe[pos] = value
            if table[probe] == 0:
                censored += 1
        return censored

    # Degrees of freedom: (r - 1)(c - 1) summed over the instances of s.
    # If, for some instance of s, a row or column sum is zero, that row
    # or column is not counted.
    d = 0
    free_x = data.numvals(x) - 1
    free_y = data.numvals(y) - 1
    for cond_inst in margin_s.insts():
        if margin_s[cond_inst] == 0:
            continue

        missing_x = count_censored(margin_x, x, pos_x, cond_inst)
        missing_y = count_censored(margin_y, y, pos_y, cond_inst)

        # The PC heuristic reduces the row/column counts by each
        # censored value, provided this does not make the DoF negative.
        if missing_x >= free_x or missing_y >= free_y:
            continue
        d += (free_x - missing_x) * (free_y - missing_y)

    if d <= 0:
        # Chi^2 is undefined here.  H_0 favours sparseness, so report
        # independence; this is what Tetrad does.
        return 1.0, 0.0, d

    # Convert every table's entries from ints to floats.
    for table in (margin_x, margin_y, margin_s, joint):
        table.map(float)

    # Expand the marginals so they range over all the joint's variables.
    margin_x = margin_x.broadcast(joint.variables())
    margin_y = margin_y.broadcast(joint.variables())
    margin_s = margin_s.broadcast(joint.variables())

    # Some statistic that is (possibly only asymptotically) distributed
    # as Chi^2.
    statistic = self.statistic(joint, margin_x, margin_y, margin_s)

    # Likelihood of independence is P(Chi > X2 | H0).  chisqprob is the
    # c.d.f., so take 1 - P(Chi <= X | H0).
    p = 1 - chisqprob(statistic, d)
    return p, statistic, d