Example #1
def test_chisquare_masked_arrays():
    # The other tests were taken from the tests for stats.chisquare, so
    # they don't test the function with masked arrays.  Here masked arrays
    # are tested.
    obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T
    mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T
    mobs = ma.masked_array(obs, mask)
    expected_chisq = np.array([24.0, 0.5])

    chisq, p = mstats.chisquare(mobs)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, stats.chisqprob(expected_chisq,
                           mobs.count(axis=0) - 1))

    chisq, p = mstats.chisquare(mobs.T, axis=1)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, stats.chisqprob(expected_chisq,
                           mobs.T.count(axis=1) - 1))

    # When axis=None, the two values should have type np.float64.
    chisq, p = mstats.chisquare([1, 2, 3], axis=None)
    assert_(isinstance(chisq, np.float64))
    assert_(isinstance(p, np.float64))
    assert_equal(chisq, 1.0)
    assert_almost_equal(p, stats.chisqprob(1.0, 2))
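Note: every snippet on this page relies on chisqprob, which was deprecated in SciPy 0.17.0 and removed in SciPy 1.0.0. Its exact modern equivalent is the chi-squared survival function; a minimal sketch:

from scipy.stats import chi2

def chisqprob(chisq, df):
    # Drop-in replacement for the removed scipy.stats.chisqprob:
    # upper-tail probability P(X > chisq) for X ~ chi-squared with df dof.
    return chi2.sf(chisq, df)

print(chisqprob(1.0, 2))  # ~0.6065, i.e. exp(-0.5) for df=2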
Example #2
def sp_tests(reg):
    """
    Calculates tests for spatial dependence in Probit models

    Parameters
    ----------

    reg         : regression object
                  output instance from a probit model            
    """    
    if reg.w:
        w = reg.w.sparse
        Phi = reg.predy
        phi = reg.phiy                
        #Pinkse_error:
        Phi_prod = Phi * (1 - Phi)
        u_naive = reg.u_naive
        u_gen = reg.u_gen
        sig2 = np.sum((phi * phi) / Phi_prod) / reg.n
        LM_err_num = np.dot(u_gen.T,(w * u_gen))**2
        trWW = np.sum((w*w).diagonal())
        trWWWWp = trWW + np.sum((w*w.T).diagonal())
        LM_err = float(1.0 * LM_err_num / (sig2**2 * trWWWWp))
        LM_err = np.array([LM_err,chisqprob(LM_err,1)])
        #KP_error:
        moran = moran_KP(reg.w,u_naive,Phi_prod)
        #Pinkse-Slade_error:
        u_std = u_naive / np.sqrt(Phi_prod)
        ps_num = np.dot(u_std.T, (w * u_std))**2
        trWpW = np.sum((w.T*w).diagonal())
        ps = float(ps_num / (trWW + trWpW))
        ps = np.array([ps,chisqprob(ps,1)]) #chi-square instead of bootstrap.
    else:
        raise Exception, "W matrix not provided to calculate spatial test."
    return LM_err,moran,ps
Example #3
    def inspect_output_by_filter(self,
                                 rez,
                                 dat,
                                 doplot=False,
                                 test=False,
                                 sig_clips=[5, 3, 2],
                                 sig_test=[False, False, True]):
        p = rez.values()[0][1]
        myoutput = rez.values()[0][0]
        new = rez.values()[0][2]
        filt = rez.keys()[0]

        ret = {}
        ret.update({"all": self._extract_info(p, myoutput.sd_beta, myoutput)})
        err = dat[2]
        tmp = (dat[1] - self.modelfunc_small_te(p, dat[0])) / err
        dof = tmp.shape[0] - myoutput.beta.shape[0]
        chisq = (tmp**2).sum()
        ret['all'].update({"ndata": dat[0].shape[0], \
                            "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
                            "normalcy_prob": normaltest(tmp)[1]})

        for s in enumerate(sig_clips):
            if sig_test[s[0]] and not test:
                continue
            sig = s[1]
            # get the indices of those inside and out of the clip area
            tmpisig = (abs(tmp) < sig).nonzero()[0]
            tmpisige = (abs(tmp) > sig).nonzero()[0]
            frac_less_than_sig = float(tmpisig.shape[0]) / dat[0].shape[0]
            # print frac_less_than_sig
            if frac_less_than_sig < 1.0:
                out = self._filt_run([dat[0][tmpisig],dat[1][tmpisig],err[tmpisig]],\
                            filt,do_sim=False,vplot=False)
                p = out[1]
                myoutput = out[0]
                t = "-test" if sig_test[s[0]] else ""

                ret.update({
                    "sig" + str(sig) + t:
                    self._extract_info(p, myoutput.sd_beta, myoutput)
                })
                tmp = (dat[1][tmpisig] - self.modelfunc_small_te(
                    p, dat[0][tmpisig])) / err[tmpisig]
                dof = tmp.shape[0] - myoutput.beta.shape[0]
                chisq = (tmp**2).sum()
                try:
                    ntest = normaltest(tmp)[1]
                except:
                    ntest = 0.0
                ret["sig" + str(sig) + t].update({"ndata": dat[0][tmpisig].shape[0], \
                                    "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
                                    "normalcy_prob": ntest, "frac_data_remaining": frac_less_than_sig })
                if doplot:
                    plot(dat[0][tmpisige], dat[1][tmpisige], ".")

        return ret
Example #4
    def _calculate_LRTs(self):
        """Run likelihood ratio test if there are enough results  """
        if all([m in self.keys() for m in [1, 2]]):
            D = -2 * self[1].lnL + 2 * self[2].lnL
            pval = chisqprob(D, 2)
            self.LRT_m1m2 = (D, pval)

        if all([m in self.keys() for m in [7, 8]]):
            D = -2 * self[7].lnL + 2 * self[8].lnL
            pval = chisqprob(D, 2)
            self.LRT_m7m8 = (D, pval)
Example #6
 def LR(self):
     try:
         return self._cache['LR']
     except AttributeError:
         self._cache = {}
         P = 1.0 * np.sum(self.y) / self.n
         LR = float(-2 * (self.n * (P * np.log(P) +
                                    (1 - P) * np.log(1 - P)) - self.logl))
         self._cache['LR'] = (LR, chisqprob(LR, self.k))
     except KeyError:
         P = 1.0 * np.sum(self.y) / self.n
         LR = float(-2 * (self.n * (P * np.log(P) +
                                    (1 - P) * np.log(1 - P)) - self.logl))
         self._cache['LR'] = (LR, chisqprob(LR, self.k))
     return self._cache['LR']
Example #9
def pfisher( pvalues ):
    """
    Combine independent P-values into one according to

    Fisher, R. A. (1948) Combining independent tests of significance.
    American Statistician, vol. 2, issue 5, page 30.

    ('Fisher method' or 'inverse ChiSquare method') See also book:
    Walter W. Piegorsch, A. John Bailer: Analyzing Environmental Data.
    Wiley 2005

    @param pvalues: list of independent P-values
    @type  pvalues: [ float ]
    @return: P-value 
    @rtype: float
    """
    ## stats.mannwhitneyu minimal P ~ stats.zprob( 8.2 );
    ## all below becomes 0. which is not handled by the fisher test 
    clipped = N.clip( pvalues, 1.0e-16, 1.0 )

    x2 = -2 * N.sum( N.log( clipped ) )

    if not USING_SCIPY:
        x2 = float( x2 )

    return stats.chisqprob( x2, 2*len(pvalues) )
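A hedged usage sketch of the Fisher combination above, using plain NumPy in place of the module's N alias and scipy.stats.chi2.sf in place of the removed stats.chisqprob (both are assumptions; the snippet's imports are not shown):

import numpy as np
from scipy.stats import chi2

pvalues = [0.04, 0.10, 0.30]  # hypothetical independent P-values
x2 = -2 * np.sum(np.log(np.clip(pvalues, 1.0e-16, 1.0)))
combined = chi2.sf(x2, 2 * len(pvalues))  # chi-squared with 2k dof
print(combined)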
Example #10
    def _calculate_hwe(self, snp, genotypes):

        """
        Calculates p-value for HWE using ChiSquare statistic: remove missing, get observed counts, get observed
        frequencies, get expected counts, calculate test values using (O-E)**2 / E and return ChiSquare probability
        with 1 degree of Freedom (bi-allelic SNP).

        """

        adjusted_samples = self.sample_size - genotypes.count(self.missing)
        hetero_obs, major_obs, minor_obs = self._get_observed(genotypes)

        try:
            p = (major_obs + (hetero_obs/2)) / adjusted_samples
            q = (minor_obs + (hetero_obs/2)) / adjusted_samples
        except ZeroDivisionError:
            #print("Detected complete missing data in SNP:", snp)
            return 0

        if (p + q) != 1:
            raise ValueError("Sum of observed allele frequencies (p + q) does not equal one.")

        hetero_exp, major_exp, minor_exp = self._get_expected(p, q, adjusted_samples)

        try:
            hetero_test = ((hetero_obs-hetero_exp)**2)/hetero_exp
            major_test = ((major_obs-major_exp)**2)/major_exp
            minor_test = ((minor_obs-minor_exp)**2)/minor_exp
        except ZeroDivisionError:
            return 0

        return stats.chisqprob(sum([hetero_test, major_test, minor_test]), 1)
Example #11
def hardy_weinberg_asymptotic(obs_het, obs_a , obs_b):
    obs_het = float(obs_het)
    obs_a = float(obs_a)
    obs_b = float(obs_b)
    sample_size = obs_het + obs_a + obs_b
    p = (((2 * obs_a) + obs_het) / ( 2 * (sample_size)))
    q = 1 - p 
    exp_a = p * p * sample_size 
    exp_b = q * q * sample_size
    exp_ab = 2 * p * q * sample_size
    
    # get chiSquare values
    if(exp_a == 0):
        chi_a = 0
    else:
        chi_a = ((obs_a - exp_a) ** 2.0) / exp_a
    if(exp_b == 0):
        chi_b = 0
    else:
        chi_b = ((obs_b - exp_b) ** 2.0) / exp_b
    if(exp_ab == 0):
        chi_ab = 0
    else:
        chi_ab = ((obs_het - exp_ab) ** 2.0) / exp_ab
    chi_sq_total = chi_a + chi_b + chi_ab
    return stats.chisqprob(chi_sq_total, 1)    
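A quick sanity check of the function above with hypothetical genotype counts, assuming stats.chisqprob is available (older SciPy, or the shim under Example #1). At exact Hardy-Weinberg proportions (p = q = 0.5, N = 100: 50 heterozygotes, 25 of each homozygote) every (O-E)^2/E term is zero:

print(hardy_weinberg_asymptotic(50, 25, 25))  # chi^2 = 0 -> p = 1.0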
Example #12
    def gof(self, x, y, ye):
        '''
        Computes GoF test statistics and other diagnostic tests

        Returns:
        --------
        - GoF test: Chi^2, p-value, and ddof
        - Normality of residuals: K^2 and p-value
        '''
        res = {}
        resid = y - self(x)
        chisq = np.sum(((resid) / ye)**2)
        ddof = len(x) - len(filter(
            None, self.errors()))  # number of estimated parameters
        chisq_pvalue = chisqprob(chisq, ddof)
        gof = (chisq, chisq_pvalue, ddof)
        resid = normaltest(resid)
        ym = y.mean()
        SStot = np.sum((y - ym)**2)
        SSerr = np.sum((y - self(x))**2)
        Rsquared = 1.0 - SSerr / SStot
        # Besides being buggy, this test for homoscedasticity is supposed to work only
        # for linear regressions, hence is not suited for our case, but I'll keep it
        # here until I figure out an alternative. Remember to uncomment the import for
        # OLS ontop.
        #        regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
        #        LM =regresults.rsquared
        #        LM_pvalue = chisqprob(LM, len(x) - ddof)
        #        white = (LM, LM_pvalue)
        #        return gof, resid, white
        return gof, resid, Rsquared
Example #13
    def test_for_pihm_w_likelihood(self, guess=[10, -5],
                        k_array=np.linspace(0.05, 2, 100),
                        fixed_pre=False, disp=True):
        """
        Test a dataset for parasite induced host mortality using the likelihood
        method

        This method compares a reduced model (negative binomial distribution)
        to a full model (Negative binomail with PIHM).  The two models are
        nested and differ by two parameters: a and b.  This amounts to fitting
        the model a negative binomial distribution to the data, then fitting
        the full model to the data and comparing likelihoods using a likelihood
        ratio test.  The likelihood ratio should be approximately chi-squared
        with the dof equal to the difference in the parameters.

        Parameters
        ----------
        guess : list
            Guesses for a and b
        k_array : array
            Array of values over which to search to best fit k
        fixed_pre : bool
            If True, the premortality parameters are fixed (mup and kp). Else,
            they are jointly estimated from the data
        disp : bool
            If True, convergence message is printed.  If False, no convergence
            method is printed


        Returns
        -------
        : chi-squared value, p-value, full nll, reduced nll, fitted params
        """

        # No params are known
        if not fixed_pre:

            # Get full nll
            params = self.likelihood_method(full_fit=True, guess=guess,
                        disp=disp)
            full_nll = likefxn1(params, self.data)

            mle_fit = mod.nbinom.fit_mle(self.data, k_array=k_array)

            red_nll = comp.nll(self.data, mod.nbinom(*mle_fit))

        # Params are known
        else:

            params = self.likelihood_method(full_fit=False, guess=guess,
                        disp=disp)
            full_nll = likefxn2(params[2:], self.data, self.mup, self.kp)

            red_nll = comp.nll(self.data, mod.nbinom(self.mup, self.kp))

        # Approximately chi-squared...though this is a large sample size approx
        chi_sq = 2 * (-full_nll - (-red_nll))
        prob = chisqprob(chi_sq, 2)

        return chi_sq, prob, full_nll, red_nll, params
Example #14
 def get_pValue(self, mutheta, sigma=None):
     minusPointLogLike = self.like(mutheta, sigma)
     minusMaxLogLike = self.ml
     size = 2
     CS = 2.0 * (minusPointLogLike - minusMaxLogLike)
     pValue = stats.chisqprob(CS, size)
     return pValue
Example #15
def fit1D(binnedE, binnedT, fracsE, fracsT, nevents, PearsonErrs=True,
          debug=False):
    # Concatenate data and prediction vectors
    datavec = np.append(binnedE,binnedT)
    predvec = [np.append(fracsE[0],fracsT[0]), np.append(fracsE[1],fracsT[1])]
    # Define inputs for minimizer
    predfunc = lambda p: p[0]*predvec[0] + p[1]*predvec[1]
    func = lambda : 1
    if PearsonErrs: func = lambda p: (datavec - predfunc(p))/np.sqrt(predfunc(p))
    else: func = lambda p: (datavec - predfunc(p))/np.sqrt(datavec)

    pfit, pcov, infodict, errmsg, success = sp.optimize.leastsq(func, nevents,
                                                                full_output=1)
    chi2 = sum([elem**2 for elem in infodict['fvec']]) 
    #mychi2 = sum([(datavec[i] - predfunc(pfit)[i])**2./predfunc(pfit)[i] for i in xrange(len(datavec))])  ### This just equals 'chi2' calculated above
    dof = datavec.size - 2
    pval = st.chisqprob(chi2, dof)
    if debug:
        print '---------------------- 1-D Fit --------------------------------'
        print 'Best fits: %s' % pfit
        print 'Cov. mat.: %s' % pcov
        print 'Chi^2: %s' % chi2
        print 'd.o.f.: %s' % dof
        print 'P-value: %s' % pval
        print '---------------------------------------------------------------'
    return pfit, pcov, chi2, pval
Example #16
def chisq_poisson(data):
    '''
    Tests if the data comes from a Poisson distribution. This is done using
    the Pearson Chi-Square test. Each value from the data given is treated like
    a categorical attribute, where the number of occurrences of the value is
    tested against the expected occurrences if the data came from a Poisson
    distribution. The hypotheses are:

        * H0 (null) - The data comes from a Poisson distribution
        * H1 - The data does NOT come from a Poisson distribution
    
    Arguments
    ---------
    data: array like
        Array with observations
    
    Returns
    -------
    (chi-square value, p-value): The chi-square value found and a p-value
                                 for the null hypothesis.
    
    Notes
    -----
    This implementation does not do any special treatment for values with
    a small number of occurrences.
    '''
    all_freqs, expected_freqs = _poisson_inputs(data)
    chisq = stats.chisquare(all_freqs, expected_freqs)[0]
    pval = stats.chisqprob(chisq, len(all_freqs) - 2)
    return chisq, pval
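The len(all_freqs) - 2 degrees of freedom reflect two constraints: the total count and the Poisson mean estimated from the data. A self-contained sketch of the same idea (the helper _poisson_inputs is not shown, so this version bins the data directly and ignores the small unobserved tail mass):

import numpy as np
from scipy import stats

data = np.random.poisson(lam=3.0, size=1000)   # data that really is Poisson
values, counts = np.unique(data, return_counts=True)
expected = stats.poisson.pmf(values, data.mean()) * data.size
chisq = np.sum((counts - expected) ** 2 / expected)
pval = stats.chi2.sf(chisq, len(values) - 2)   # -2: total count + fitted mean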
Example #17
def chisquare(obs, exp=None):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    If no expected frequencies are supplied, the total N is assumed to be
    equally distributed across all cells.

    Returns: chisquare-stats, associated p-value (upper tail)
    """
    obs = N.array(obs)

    # get total number of observations
    nobs = N.sum(obs)

    # if no expected values are supplied assume equal distribution
    if exp is None:
        exp = N.ones(obs.shape) * nobs / N.prod(obs.shape)

    # make sure to have floating point data
    exp = exp.astype(float)

    # compute chisquare value
    chisq = N.sum((obs - exp )**2 / exp)

    # return chisq and probability (upper tail)
    return chisq, stats.chisqprob(chisq, N.prod(obs.shape) - 1)
Example #18
def computeContingencyTablePValue(*observedTuples):
    if len(observedTuples) == 0: return None

    rowSums = []

    for row in observedTuples:
        rowSums.append(float(sum(row)))

    columnSums = []
    for i in range(len(observedTuples[0])):
        columnSum = 0.0

        for row in observedTuples:
            columnSum += row[i]

        columnSums.append(float(columnSum))

    grandTotal = float(sum(rowSums))
    observedTestStatistic = 0.0

    for i in range(len(observedTuples)):
        for j in range(len(columnSums)):
            expectedValue = (rowSums[i]/grandTotal)*(columnSums[j]/grandTotal)*grandTotal
            observedValue = float(observedTuples[i][j])
            observedTestStatistic += ((observedValue - expectedValue)**2) / expectedValue

    degreesFreedom = (len(columnSums) - 1) * (len(rowSums) - 1)

    from scipy.stats import chisqprob
    return chisqprob(observedTestStatistic, degreesFreedom)
Example #19
def chisquare(obs, exp='uniform'):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    Parameters
    ----------
    obs : array
      Observations matrix
    exp : ('uniform', 'indep_rows') or array, optional
      Matrix of expected values of the same size as `obs`.  If no
      array is given, 'uniform' distributes all observations evenly
      across the cells.  'indep_rows' derives the expected values from
      the relative frequencies across columns, so if the contingency
      table is predictions vs. targets it accounts for imbalance among
      the targets.  Although 'uniform' is the default, 'indep_rows' is
      preferable for confusion matrices.

    Returns
    -------
    tuple
     chisquare-stats, associated p-value (upper tail)
    """
    obs = np.array(obs)

    # get total number of observations
    nobs = np.sum(obs)

    # if no expected values are supplied assume equal distribution
    if not isinstance(exp, np.ndarray):
        ones = np.ones(obs.shape, dtype=float)
        if exp == 'indep_rows':
            # multiply each column
            exp = np.sum(obs, axis=0)[None, :] * ones / obs.shape[0]
        elif exp == 'indep_cols':
            # multiply each row
            exp = np.sum(obs, axis=1)[:, None] * ones / obs.shape[1]
        elif exp == 'uniform':
            # just evenly distribute
            exp = nobs * np.ones(obs.shape, dtype=float) / np.prod(obs.shape)
        else:
            raise ValueError(
                "Unknown specification of expected values exp=%r" % (exp, ))
    else:
        assert (exp.shape == obs.shape)

    # make sure to have floating point data
    exp = exp.astype(float)

    # compute chisquare value
    exp_zeros = exp == 0
    exp_nonzeros = np.logical_not(exp_zeros)
    if np.sum(exp_zeros) != 0 and (obs[exp_zeros] != 0).any():
        raise ValueError("chisquare: Expected values have 0-values, but there are actual" \
              " observations -- chi^2 cannot be computed")
    chisq = np.sum(((obs - exp)**2)[exp_nonzeros] / exp[exp_nonzeros])

    # return chisq and probability (upper tail)
    # taking only the elements with something expected
    return chisq, st.chisqprob(chisq, np.sum(exp_nonzeros) - 1)
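A usage sketch for the function above with a hypothetical 2x2 confusion matrix (this assumes the module's np and st imports are in scope, and st.chisqprob needs SciPy < 1.0 or the shim from Example #1); 'indep_rows' builds the expected counts from the column totals instead of spreading all observations evenly:

import numpy as np

confusion = np.array([[8, 2],
                      [3, 7]])  # hypothetical predictions x targets
print(chisquare(confusion))                    # exp='uniform'
print(chisquare(confusion, exp='indep_rows'))  # column totals / n_rows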
Example #21
 def combine_fisher(self, pvalue1, pvalue2):
     if pvalue1 == 0.0 or pvalue2 == 0.0:
         return 0.0
     else:
         chi = -2.0 * (math.log(pvalue1) + math.log(pvalue2))
         p_out = chisqprob(chi, 4)
         return p_out
Example #22
def get_p_value_pearson_chi_squared(contingency_table):
	
	(n11,n12,n21,n22) = contingency_table

	n = n11+n12+n21+n22
	
	if n == 0:
		
		raise "The contingency table is empty"
	
	n_1_plus = float(n11 + n12)
	n_2_plus = float(n21 + n22)
	n_plus_1 = float(n11 + n21)
	n_plus_2 = float(n12 + n22)
	
	if n == n_1_plus:

		return float(1)
	
	elif n == n_2_plus:
		
		return float(1)
		
	# eij = (n_i_plus)(n_plus_j)/n
	e11 = (n_1_plus)*(n_plus_1)/n
	e12 = (n_1_plus)*(n_plus_2)/n
	e21 = (n_2_plus)*(n_plus_1)/n
	e22 = (n_2_plus)*(n_plus_2)/n
					
	chi2 = (math.pow(n11-e11,2)/e11) + (math.pow(n12-e12,2)/e12) + (math.pow(n21-e21,2)/e21) + (math.pow(n22-e22,2)/e22)

	p_value = chisqprob(chi2,1)

	return p_value
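A hedged cross-check of the function above against SciPy's contingency-table test, with the Yates correction disabled to match the plain Pearson formula (the table values are hypothetical):

from scipy.stats import chi2_contingency

table = (20, 10, 5, 15)  # hypothetical (n11, n12, n21, n22)
p_manual = get_p_value_pearson_chi_squared(table)
chi2_stat, p_scipy, dof, expected = chi2_contingency([[20, 10], [5, 15]],
                                                     correction=False)
# p_manual and p_scipy should agree; dof == 1 for a 2x2 table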
Example #23
def chi_square(hist_1, hist_2):
    diff_1 = hist_1 - hist_2
    val = np.nansum((np.power(diff_1, 2))/(hist_1+hist_2))
    ddof = len(hist_1)

    print('T = {}'.format(val))
    print('P(chi^2 > T) = {}'.format(chisqprob(val, ddof)))
Example #25
def check_chisquare(f_obs, f_exp, ddof, axis, expected_chi2):
    # Use this only for arrays that have no masked values.
    f_obs = np.asarray(f_obs)
    if axis is None:
        num_obs = f_obs.size
    else:
        if axis == 'no':
            use_axis = 0
        else:
            use_axis = axis
        b = np.broadcast(f_obs, f_exp)
        num_obs = b.shape[use_axis]

    if axis == 'no':
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, axis=axis)
    assert_array_equal(chi2, expected_chi2)

    ddof = np.asarray(ddof)
    expected_p = stats.chisqprob(expected_chi2, num_obs - 1 - ddof)
    assert_array_equal(p, expected_p)

    # Also compare to stats.chisquare
    if axis == 'no':
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        stats_chisq, stats_p = stats.chisquare(f_obs,
                                               f_exp=f_exp,
                                               ddof=ddof,
                                               axis=axis)
    assert_array_almost_equal(chi2, stats_chisq)
    assert_array_almost_equal(p, stats_p)
Example #26
	def plot_data_key(self, nm, conn, gnm=None):
	
		self.gather_points(nm, conn)
		self.gdat = [ [n, self.datpts[k].value * self.yscale, self.datpts[k].err * self.yscale] for (n,k) in enumerate(self.datkeys) if self.datpts[k].value is not None]
		
		try:
			self.LF.fit(self.gdat,cols=(0,1,2),errorbarWeights=True)
			chi2 = self.LF.chisquared()
			ndf = self.LF.nu()
			statdat = {"mu":self.LF.coeffs[0], "rms":self.LF.rmsDeviation(), "uncert":self.LF.coeffErr(0), "chi2":chi2 , "ndf":ndf}
			if stats:
				statdat["prob"] = stats.chisqprob(statdat["chi2"],statdat["ndf"])
			else:
				statdat["prob"] = 0
			if gnm:
				gnm = gnm%statdat
			print gnm
			self.g.plot(graph.data.function("y(x)=%g"%self.LF.coeffs[0], title=None), [ graph.style.line(lineattrs=[self.ptcolor,style.linestyle.dashed]),])
		except:
			gnm = None
			
		self.g.axes['x'].max = self.gdat[-1][0]
					
		self.g.plot(graph.data.points(self.gdat,x=1,y=2,dy=3,title=gnm), [ graph.style.errorbar(errorbarattrs=[self.ptcolor]),
																		graph.style.symbol(self.ptsymb, size=0.15, symbolattrs = [self.ptcolor])])
Example #27
def walds_test(profile1, profile2):
    """Calculate the compatibility of two statistically independent
    measurements using normal approximation (Wald's method).

    This assumes that the log-likelihood space is approximately elliptical.

    Parameters
    ----------
    profile1 : (x,y,llh) for measurement 1
    profile2 : (x,y,llh) for measurement 2

    """
    from scipy.stats import chisqprob
    from scipy.special import erfinv
    bestfits, covariances = [], []
    for x, y, llhs in [profile1, profile2]:
        idx_min = np.unravel_index(llhs.argmin(), llhs.shape)
        bestfit = x[idx_min[1]], y[idx_min[0]]
        bestfits.append(bestfit)
        covariance = estimate_cov_from_contour(x, y, llhs, bestfit)
        covariances.append(covariance)

    diff = np.matrix(bestfits[0]) - np.matrix(bestfits[1])
    cov_inv = np.linalg.inv(covariances[0] + covariances[1])

    chi2 = diff*cov_inv*diff.transpose()
    ndof = 2
    pvalue = chisqprob(chi2, ndof)
    nsigma = erfinv(1-pvalue) * np.sqrt(2) # 2-sided significance

    return (chi2, ndof, pvalue, nsigma)
Example #29
def likelihood_ratio_test(ll_min: float, ll_max: float, dof_min: int,
                          dof_max: int) -> (float, float):
    """
    Assesses the goodness of fit of two competing statistical models based on the ratio of their likelihoods.
    

    Parameters
    ----------
    ll_min : float
        Log-likelihood of the less complex model.
    ll_max : float
        Log-likelihood of the more complex model.
    dof_min : int
        Degrees of freedom of the less complex model.
    dof_max : int
        Degrees of freedom of the more complex model.

    Returns
    -------
    (float, float)
        lr: Likelihood ratio.
        p: p Value.

    """
    lr = 2 * (ll_max - ll_min)
    delta_dof = dof_max - dof_min
    p = stats.chisqprob(lr, delta_dof)
    return (lr, p)
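A usage sketch with hypothetical log-likelihoods from two nested fits (this assumes stats.chisqprob is still available, e.g. via the chi2.sf shim shown under Example #1):

ll_reduced, ll_full = -1204.2, -1199.8   # hypothetical fit results
lr, p = likelihood_ratio_test(ll_reduced, ll_full, dof_min=3, dof_max=5)
# lr = 8.8, tested against a chi-squared distribution with 2 dof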
Example #30
def englishness(s):
	if s is None:
		return 0
	from scipy.stats import chisqprob
	#uses a chi square algorithm to match the relative character frequencies
	#in the test string to those of real English
	score = 0
	s = s.lower()
	for c in EXTRA_CHARS:
		s = s.replace(c,'') #completely ignore characters in EXTRA_CHARS
	frequency = defaultdict(float)
	ignored = 0 #ignore, but only for purpose of chisquare computation
	length = len(s)
	for i in s:
		#analyze each character
		if i not in string.printable: #non-printables are bad
			score += 2
			ignored += 1
		elif i in string.digits: #digits aren't that bad
			score += 0.5
			ignored += 1
		elif i not in string.ascii_lowercase and i not in ' ':#special chars are eh
			score += 1
			ignored += 1
		else:
			frequency[i] += 1 #analyze alphabetic frequencies.
		
	for i in frequency:
		freq = frequency[i] / (length - ignored)
		# Chi square
		score += pow((freq - CHAR_FREQ[i]/100), 2) / (CHAR_FREQ[i]/100)
	if not score:
		return 0
	return chisqprob(score,1) * 100 #return probability
Example #32
def get_most_likely_cn(combo, cn_lik, pval_cutoff):
    '''
    use the most likely phi state, unless p < cutoff when compared to the
    most likely clonal (phi=1) case (log likelihood ratio test)
    - in this case, pick the CN state with the highest clonal likelihood
    '''
    cn_lik_phi, cn_lik_clonal = cn_lik
    ll_phi, ll_clonal = cn_lik_phi[1], cn_lik_clonal[1]

    empty_result = [float('nan'), float('nan'), float('nan'), float('nan')]
    if len(combo) == 0:
        return empty_result
    elif len(combo) == 1:
        return combo[0]
    elif np.all(np.isnan(ll_phi)) and np.all(np.isnan(ll_clonal)):
        return empty_result
    elif np.all(np.isnan(ll_phi)):
        return combo[index_of_max(ll_clonal)]
    elif np.all(ll_phi == ll_clonal) or pval_cutoff == 0:
        return combo[index_of_max(ll_phi)]

    # log likelihood ratio test; null hypothesis = likelihood under phi
    # use clonal if best clonal solution significantly better than worst phi solution
    #LLR   = 2 * (np.nanmax(ll_clonal) - np.nanmax(ll_phi))
    LLR   = 2 * (np.nanmax(ll_clonal) - np.nanmin(ll_phi))
    p_val = stats.chisqprob(LLR, 1) if not np.isnan(LLR) else 1

    if p_val < pval_cutoff:
        return combo[index_of_max(ll_clonal)]
    else:
        return combo[index_of_max(ll_phi)]
Example #33
def FisherMethodPvals(pvalues_array):
    # (adapted from code by Arie Shaus, Nov 2016)
    pvalues_array = np.array(pvalues_array)
    k = len(pvalues_array)
    z = -2*sum(np.log(pvalues_array))
    combined_Pval = chisqprob(z,2*k)
    return combined_Pval
Example #34
def wald_test(betas, r, q, vm):
    '''
    Chi sq. Wald statistic to test for restriction of coefficients.
    Implementation following Greene [Greene2003]_ eq. (17-24), p. 488

    ...

    Parameters
    ==========
    betas   : array
              kx1 array with coefficient estimates
    r       : array
              Array of dimension Rxk (R being the number of restrictions) with the constraint setup.
    q       : array
              Rx1 array with constants in the constraint setup. See Greene
              [1]_ for reference.
    vm      : array
              kxk variance-covariance matrix of coefficient estimates

    Returns
    =======
    w       : float
              Wald statistic
    pvalue  : float
              P value for Wald statistic calculated as a Chi sq. distribution
              with R degrees of freedom

    '''
    rbq = np.dot(r, betas) - q
    rvri = la.inv(np.dot(r, np.dot(vm, r.T)))
    w = np.dot(rbq.T, np.dot(rvri, rbq))[0][0]
    df = r.shape[0]
    pvalue = chisqprob(w, df)
    return w, pvalue
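A hedged sketch of a single-restriction Wald test (beta_1 = 0) with hypothetical estimates, assuming the snippet's la (linear algebra) and chisqprob names resolve in the module; R is 1xk and q is 1x1, so the statistic has one degree of freedom:

import numpy as np

betas = np.array([[1.5], [0.3]])   # hypothetical 2x1 coefficient estimates
vm = np.array([[0.10, 0.01],
               [0.01, 0.04]])      # hypothetical 2x2 covariance matrix
r = np.array([[0.0, 1.0]])         # restriction: beta_1 = 0
q = np.array([[0.0]])
w, pvalue = wald_test(betas, r, q, vm)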
Example #35
def check_sample_var(sv,n, popvar):
    # two-sided chisquare test for sample variance equal to hypothesized variance
    df = n-1
    chi2 = (n-1)*sv/float(popvar)
    pval = stats.chisqprob(chi2,df)*2
    npt.assert_(pval > 0.01, 'var fail, t, pval = %f, %f, v, sv=%f, %f' %
            (chi2,pval,popvar,sv))
Example #36
 def LR(self):
     if 'LR' not in self._cache:
         P = 1.0 * np.sum(self.y) / self.n
         LR = float(-2 * (self.n * (P * np.log(P) +
                                    (1 - P) * np.log(1 - P)) - self.logl))
         self._cache['LR'] = (LR, chisqprob(LR, self.k))
     return self._cache['LR']
Example #37
def gtest(obs, exp, ddof=0):
    '''
    http://en.wikipedia.org/wiki/G-test
    test for goodness of fit to expected frequencies
    
    obs - observed freqs
    exp - expected freqs
    ddof - delta dof

    returns
    chisquare statistic and p value
    
    based on https://gist.github.com/brentp/570896
    '''

    assert len(obs) == len(exp)
    assert not 0.0 in exp

    n = len(obs)

    g = 0.0
    for i in xrange(n):
        if obs[i] == 0.0: continue  #Oi * ln( Oi / Ei) == 0 if Oi == 0

        g += obs[i] * math.log(obs[i] / exp[i])

        if exp[i] < 5.0:
            sys.stderr.write("warning: expected value less than 5 in gtest\n")
    g *= 2.0

    return g, chisqprob(g, n - 1 - ddof)
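For a cross-check, SciPy exposes the same G statistic through power_divergence with lambda_="log-likelihood" (the snippet itself is Python 2, hence xrange); a minimal sketch with hypothetical frequencies:

from scipy.stats import power_divergence

obs = [10, 20, 30, 40]
exp = [25, 25, 25, 25]
g, p = power_divergence(obs, f_exp=exp, lambda_="log-likelihood")
# g matches gtest's statistic; p uses len(obs) - 1 dof by default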
Example #39
def lrtest(llmin, llmax):
    """
    Likelihood Ratio Test (LRT) by Joanna Diong
    https://scientificallysound.org/2017/08/24/the-likelihood-ratio-test-relevance-and-application/

    Example:

    # import example dataset
    data = sm.datasets.get_rdataset("dietox", "geepack").data

    # fit time only to pig weight
    md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
    mdf = md.fit(reml=False)
    print(mdf.summary())
    llf = mdf.llf

    # fit time and litter to pig weight
    mdlitter = smf.mixedlm("Weight ~ Time + Litter", data, groups=data["Pig"])
    mdflitter = mdlitter.fit(reml=False)
    print(mdflitter.summary())
    llflitter = mdflitter.llf

    lr, p = lrtest(llf, llflitter)
    print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p))

    :param llmin: Log-likelihood of null model (the model without the variable we are considering to add).
    :param llmax: Log-likelihood of the alternative model (the model with the extra variable).
    :return: lr, p
    * lr: likelihood ratio
    * p: p-value to reject the hypothesis that the alternative model fits the data no better than the null model.
    """
    lr = 2 * (llmax - llmin)
    p = stats.chisqprob(lr, 1) # llmax has 1 dof more than llmin
    return lr, p
Example #40
 def p_true(self):
     self.truelike = self.M_true.likelihood(self.M_true.simData)
     self.findmle(self.tau01_true, self.tau12_true, self.N_true)
     self.CS = 2.0 * (self.ml - self.truelike)  #plus log-likelihood
     numparam = 2
     self.pValue = stats.chisqprob(self.CS, numparam)
     return self.pValue
Example #41
def test_for_pihm(data, guess=[10, -5], crof_params=None):
    """
    Test a dataset for parasite induced host mortality

    Parameters
    ----------
    data : array
        Hosts with a given parasite loads

    Returns
    -------
    :
    """

    # Get full nll
    params = likelihood_method(data, crof_params=crof_params, guess=guess)
    full_nll = likefxn1(params, data)


    if crof_params:
        mu, k = np.array(crof_params)[1:]
        red_nll = comp.nll(data, mod.nbinom(mu, k))

    else:
        mle_fit = mod.nbinom.fit_mle(data, k_array=np.linspace(.1, 2, 100))
        red_nll = comp.nll(data, mod.nbinom(*mle_fit))

    chi_sq = 2 * (-full_nll - (-red_nll))
    prob = chisqprob(chi_sq, 2)
    return chi_sq, prob, full_nll, red_nll
Example #44
    def __init__(self, statmatch):
        """
        Populate a pandas data frame and pass it forward as BalanceStatistics. Generally, operations are vectorized
        where possible and each method works on several covariates from a statistical matching routine at a time.
        :param statmatch: StatisticalMatching instance that has been fitted
        :return: BalanceStatistics instance
        """

        # Could be replaced with an ordered dictionary
        columns = [
            'unmatched_treated_mean', 'unmatched_control_mean',
            'unmatched_bias', 'unmatched_t_statistic', 'unmatched_p_value',
            'matched_treated_mean', 'matched_control_mean', 'matched_bias',
            'matched_t_statistic', 'matched_p_value', 'bias_reduction'
        ]

        data = {
            'unmatched_treated_mean': self._unmatched_treated_mean(statmatch),
            'unmatched_control_mean': self._unmatched_control_mean(statmatch),
            'unmatched_bias': self._unmatched_bias(statmatch),
            'unmatched_t_statistic': self._unmatched_t_statistic(statmatch),
            'unmatched_p_value': self._unmatched_p_value(statmatch),
            'matched_treated_mean': self._matched_treated_mean(statmatch),
            'matched_control_mean': self._matched_control_mean(statmatch),
            'matched_bias': self._matched_bias(statmatch),
            'matched_t_statistic': self._matched_t_statistic(statmatch),
            'matched_p_value': self._matched_p_value(statmatch),
            'bias_reduction': self._bias_reduction(statmatch)
        }

        # dataframe with column defined above
        super(BalanceStatistics, self).__init__(data,
                                                index=statmatch.names,
                                                columns=columns)

        # Whenever it becomes a problem that we have three copies of how to run regression, we can refactor this into another class
        fitted_reg = self._fit_unmatched_regression(statmatch)
        self.unmatched_prsquared = 1 - fitted_reg.llf / fitted_reg.llnull
        self.unmatched_llr = -2 * (fitted_reg.llnull - fitted_reg.llf)
        self.unmatched_llr_pvalue = chisqprob(self.unmatched_llr,
                                              fitted_reg.df_model)

        fitted_reg = self._fit_matched_regression(statmatch)
        self.matched_prsquared = 1 - fitted_reg.llf / fitted_reg.llnull
        self.matched_llr = -2 * (fitted_reg.llnull - fitted_reg.llf)
        self.matched_llr_pvalue = chisqprob(self.matched_llr,
                                            fitted_reg.df_model)
Example #45
 def hessTest(self, L):
     matrixList = L[:]
     for xy in matrixList:
         xy = numpy.matrix(xy).T
         xy[0] -= self.MLE[0]
         xy[1] -= self.MLE[1]
         CS = xy.T * self.H * xy
         print 'p-value:', stats.chisqprob(CS, 2)
Example #46
def fisher_combine(pvals):
    """ combined fisher probability with correction
    Use fdr correction for 25 comparisons using rpy2"""
    if all(p == "NA" for p in pvals): return np.nan
    pvals = [p for p in pvals if p != "NA"]
    if len(pvals) == 1: return pvals[0]
    s = -2 * np.sum(np.log(pvals))
    return chisqprob(s, 2 * len(pvals))
Example #47
def likelihood_ratio_test(counts, model1, model2):
    # see formula <http://en.wikipedia.org/wiki/Likelihood-ratio_test>
    print 'Test %s and %s' % (model1.name, model2.name)
    D = -2 * (model1.lnL - model2.lnL)
    df = model2.df - model1.df
    p_value = chisqprob(D, df) 

    print 'D = %.1f, df = %d, P-value = %.2g' % (D, df, p_value)
Example #51
def hessEval(r, theta, nbf, alpha=0.0):
    global H, MLE
    x = r * math.cos(theta)
    y = r * math.sin(theta)
    # print 'x', x, 'y', y
    # HI = numpy.matrix(H).I
    xy = numpy.matrix([[x], [y]])
    CS = xy.T * H * xy
    return stats.chisqprob(CS, 2) - alpha
Example #52
def chi2(BET, feature_1, feature_2):

    l = (len(BET))
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    obs_freq = {}
    exp_freq = {}
    sum_exp_freq_vertical = np.zeros(len(feature_2))
    chi2 = 0

    for i in range(len(feature_1)):
        obs_freq[feature_1[i]] = []

        for j in range(len(feature_2)):
            col1 = (feature_1[i])
            col2 = (feature_2[j])
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)

        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(
            obs_freq[feature_1[i]])
    total_in_contingency = sum(sum_exp_freq_vertical)

    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal *
                 sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + (
                (obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**
                2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)

    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('chisqprob: ' + str(chisqprob(chi2, df)))
    return (chisqprob(chi2, df))
Example #53
def calculateCombinedFisher(significanceValuesList):
    #X^2_2k ~ -2 * sum(ln(p_i))

    accumulatedValue = 0
    for significanceValues in significanceValuesList:
        accumulatedValue += log(significanceValues[2])

    accumulatedValue = accumulatedValue * -2

    return (chisqprob(accumulatedValue, 2 * len(significanceValuesList)))
Example #54
def chi_square_shape(hist_1, hist_2):
    n1 = np.sum(hist_1)
    n2 = np.sum(hist_2)
    diff_1 = (hist_1/n1) - (hist_2/n2)
    sum_1 = (hist_1/(n1*n1)) + (hist_2/(n2*n2))
    val = np.nansum((np.power(diff_1, 2))/sum_1)
    ddof = len(hist_1) - 1

    print ('T = {}'.format(val))
    print('P(chi^2 > T) = {}'.format(chisqprob(val, ddof)))
Example #55
 def combine_fisher(self, pvalue1, pvalue2):
     """ Combine two p-values using Fihser's method. See
         https://en.wikipedia.org/wiki/Fisher%27s_method for
         more details
     """
     if pvalue1 == 0.0 or pvalue2 == 0.0:
         return 0.0
     chi = -2.0 * (math.log(pvalue1) + math.log(pvalue2))
     p_out = chisqprob(chi, 4)
     return p_out
Example #56
def LRT(ll1, ll2, df):
    """
    Calculates a likelihood ratio test between two models.
    :params ll1, ll2: log-likelihoods of the two models studied
    :param df: degrees of freedom of the difference between the two models
    """
    LR = abs(2 * (ll1 - ll2))
    # shim for SciPy >= 1.0, where stats.chisqprob was removed
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
    p = stats.chisqprob(LR, df)
    return (LR, p)
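The monkey-patched stats.chisqprob above is a common workaround for SciPy >= 1.0, where the function was removed. A usage sketch with hypothetical log-likelihoods:

lr, p = LRT(-1210.4, -1206.1, df=2)  # lr = 8.6 on 2 degrees of freedom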
Example #57
def chisq_2sam(f_obs1, f_obs2):
    """
    Calculates a two-sample chi square test.
       
    The two samples chi square test tests the null hypothesis that the two
    categorical data sample have the same frequencies.
    
    Parameters
    ----------
    f_obs1, f_obs2 : two arrays
        with observed frequencies in each category. The number of categories
        must be the same.
    
    Returns
    -------
    chisquare statistic : float
        The chisquare test statistic
    p : float
        The p-value of the test.
    
    Notes
    -----
    If the number of observation is the same across the two samples, then the
    number of degrees of freedom is equal to the number of bins minus one (due
    to the additional constraint on the sample size), else it is equal to the
    number of bins. The same observations on the size of the sample in
    the one-way chi squared test (see scipy.stats.chisquare) apply also for the
    case with two samples.
    
    Examples
    --------
    >>> chisq_2sam(np.ones(10), np.ones(10)) # same frequencies
    (0.0, 1.0)
    >>> chi2, pval = chisq_2sam([100, 0, 0], [0, 0, 100])
    >>> print chi2
    200.0
    >>> print pval
    2.08848758376e-45
    """
    if len(f_obs1) != len(f_obs2):
        raise ValueError('expecting same number of bins')
    f_obs1, f_obs2 = np.asarray(f_obs1, dtype=int), np.asarray(f_obs2, dtype=int)
    s1, s2 = np.sum(f_obs1), np.sum(f_obs2)
    if s1 == s2:
        ksntrns = 1 
    else:
        ksntrns = 0
    idx = ( f_obs1 + f_obs2 ) == 0.
    ksntrns += np.sum(idx.astype(int))
    ddof = len(f_obs1) - ksntrns
    ratio1, ratio2 = map(np.sqrt, [ s2 / s1, s1 / s2 ] )
    chisq = (( f_obs1 * ratio1 ) - ( f_obs2 * ratio2 ))**2 / ( f_obs1 + f_obs2 )
    chisq = np.sum(chisq[~idx])
    return chisq, chisqprob(chisq, ddof)
Example #58
def Fisher_combination_Pvals(pvalues_array):
    pvalues_array = np.array(pvalues_array)
    z=0
    for pval in pvalues_array:
        if pval > 1.e-20:
            z += -2*np.log(pval)
        else:
            z += -2*np.log(1.e-20)
    k = len(pvalues_array)
    combined_Pval = chisqprob(z,2*k)
    return combined_Pval