Example #1
0
 def runTest(self):
     # Check chisqprob against tabulated reference values.
     #
     # these values are from pchisq as found in R version 2.5.1
     # it would be nice if there were some properties of chisqprob
     # that could be verified but I could not find any.
     #
     # assertEqual/assertAlmostEqual are used instead of the deprecated
     # assertEquals/assertAlmostEquals aliases, and range() instead of
     # xrange(), so the test runs unchanged on Python 2.7 and Python 3.
     self.assertAlmostEqual(chisqprob(1,1),0.6826895,6)

     # a statistic of zero must give exactly zero, for 0..99 degrees of
     # freedom
     for k in range(100):
         self.assertEqual(chisqprob(0,k), 0)

     # chisqprob(i,i) for i = 0..10, compared to 6 decimal places
     expected = [0.0000000,0.6826895,0.6321206,0.6083748,
                 0.5939942,0.5841198,0.5768099,0.5711201,
                 0.5665299,0.5627258,0.5595067]
     for i,val in enumerate(expected):
         self.assertAlmostEqual(chisqprob(i,i),val,6)
Example #2
0
    def test_independ(self,x,y,s):
        """Test whether x is independent of y given the variables in s.

        Returns a tuple (p, statistic, d):

        p         -- 1 - chisqprob(statistic, d); the probability of the
                     available data under the null hypothesis of independence.
        statistic -- the value returned by self.statistic for the joint and
                     marginal count factors.
        d         -- the number of degrees of freedom available in the data
                     for this test.

        When d is not positive the test is undefined and (1.0, 0.0, d) is
        returned, which favours independence.
        """
        data = self._data
        cond_vars = tuple(s)

        # count factors over the conditioning set extended with x and/or y
        joint = data.makeFactor(cond_vars + (x,y))
        margin_x = data.makeFactor(cond_vars + (x,))
        margin_y = data.makeFactor(cond_vars + (y,))
        margin_s = data.makeFactor(cond_vars)

        # Degrees of freedom: (r - 1)(c - 1) summed over every instantiation
        # of the conditioning set that actually occurs in the data.  Values of
        # x or y whose marginal count is zero under an instantiation are
        # "censored" and do not contribute a row/column.
        pos_x = sorted(margin_x.variables()).index(x)
        pos_y = sorted(margin_y.variables()).index(y)
        dof = 0
        free_x = data.numvals(x) - 1
        free_y = data.numvals(y) - 1
        for cond in margin_s.insts():
            if margin_s[cond] == 0:
                # this instantiation never occurs in the data
                continue

            # number of x values never seen under this instantiation
            censored_x = 0
            probe_x = list(cond[:pos_x] + (None,) + cond[pos_x:])
            for value in data.values(x):
                probe_x[pos_x] = value
                if margin_x[probe_x] == 0:
                    censored_x += 1

            # number of y values never seen under this instantiation
            censored_y = 0
            probe_y = list(cond[:pos_y] + (None,) + cond[pos_y:])
            for value in data.values(y):
                probe_y[pos_y] = value
                if margin_y[probe_y] == 0:
                    censored_y += 1

            # PC heuristic: each censored value removes one row/column, but
            # an instantiation never contributes a non-positive amount.
            if censored_x >= free_x or censored_y >= free_y:
                continue
            dof += (free_x - censored_x)*(free_y - censored_y)

        if dof <= 0:
            # Chi^2 is undefined here.  The null hypothesis favours
            # sparseness, so report independence (this is what Tetrad does).
            return 1.0, 0.0, dof

        # convert the integer counts to floats
        for factor in (margin_x, margin_y, margin_s, joint):
            factor.map(float)

        # expand every marginal factor to range over all variables of the joint
        margin_x, margin_y, margin_s = [
            factor.broadcast(joint.variables())
            for factor in (margin_x, margin_y, margin_s)
        ]

        # a statistic that is (possibly only asymptotically) Chi^2 distributed
        stat = self.statistic(joint, margin_x, margin_y, margin_s)

        # The likelihood of independence is P(Chi > X2 | H0).  chisqprob is
        # the c.d.f., hence 1 - P(Chi <= X2 | H0).
        return 1 - chisqprob(stat, dof), stat, dof