def test_chisquare_masked_arrays():
    # The other tests were taken from the tests for stats.chisquare, so
    # they don't test the function with masked arrays.  Here masked arrays
    # are tested.
    obs = np.array([[8, 8, 16, 32, -1], [-1, -1, 3, 4, 5]]).T
    mask = np.array([[0, 0, 0, 0, 1], [1, 1, 0, 0, 0]]).T
    mobs = ma.masked_array(obs, mask)
    expected_chisq = np.array([24.0, 0.5])

    chisq, p = mstats.chisquare(mobs)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, stats.chisqprob(expected_chisq, mobs.count(axis=0) - 1))

    chisq, p = mstats.chisquare(mobs.T, axis=1)
    assert_array_equal(chisq, expected_chisq)
    assert_array_almost_equal(
        p, stats.chisqprob(expected_chisq, mobs.T.count(axis=1) - 1))

    # When axis=None, the two values should have type np.float64.
    chisq, p = mstats.chisquare([1, 2, 3], axis=None)
    assert_(isinstance(chisq, np.float64))
    assert_(isinstance(p, np.float64))
    assert_equal(chisq, 1.0)
    assert_almost_equal(p, stats.chisqprob(1.0, 2))
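# Note: nearly every snippet in this section calls stats.chisqprob, which was
# deprecated in SciPy 0.17 and removed in SciPy 1.0; scipy.stats.chi2.sf(x, df)
# computes the same upper-tail probability.  A minimal compatibility sketch,
# assuming only that scipy is installed (the LRT snippet near the end of this
# section patches the name the same way):
from scipy import stats

if not hasattr(stats, 'chisqprob'):
    # survival function of the chi-squared distribution: P(X2 > x) with df dof
    stats.chisqprob = stats.chi2.sf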
def sp_tests(reg):
    """
    Calculates tests for spatial dependence in Probit models

    Parameters
    ----------
    reg         : regression object
                  output instance from a probit model
    """
    if reg.w:
        w = reg.w.sparse
        Phi = reg.predy
        phi = reg.phiy

        # Pinkse_error:
        Phi_prod = Phi * (1 - Phi)
        u_naive = reg.u_naive
        u_gen = reg.u_gen
        sig2 = np.sum((phi * phi) / Phi_prod) / reg.n
        LM_err_num = np.dot(u_gen.T, (w * u_gen)) ** 2
        trWW = np.sum((w * w).diagonal())
        trWWWWp = trWW + np.sum((w * w.T).diagonal())
        LM_err = float(1.0 * LM_err_num / (sig2 ** 2 * trWWWWp))
        LM_err = np.array([LM_err, chisqprob(LM_err, 1)])

        # KP_error:
        moran = moran_KP(reg.w, u_naive, Phi_prod)

        # Pinkse-Slade_error:
        u_std = u_naive / np.sqrt(Phi_prod)
        ps_num = np.dot(u_std.T, (w * u_std)) ** 2
        trWpW = np.sum((w.T * w).diagonal())
        ps = float(ps_num / (trWW + trWpW))
        ps = np.array([ps, chisqprob(ps, 1)])  # chi-square instead of bootstrap
    else:
        raise Exception("W matrix not provided to calculate spatial test.")
    return LM_err, moran, ps
def inspect_output_by_filter(self, rez, dat, doplot=False, test=False,
                             sig_clips=[5, 3, 2], sig_test=[False, False, True]):
    p = list(rez.values())[0][1]
    myoutput = list(rez.values())[0][0]
    new = list(rez.values())[0][2]
    filt = list(rez.keys())[0]
    ret = {}
    ret.update({"all": self._extract_info(p, myoutput.sd_beta, myoutput)})
    err = dat[2]
    tmp = (dat[1] - self.modelfunc_small_te(p, dat[0])) / err
    dof = tmp.shape[0] - myoutput.beta.shape[0]
    chisq = (tmp**2).sum()
    ret['all'].update({"ndata": dat[0].shape[0],
                       "chisq": chisq, "dof": dof,
                       "p_chi": chisqprob(chisq, dof),
                       "normalcy_prob": normaltest(tmp)[1]})

    for s in enumerate(sig_clips):
        if sig_test[s[0]] and not test:
            continue
        sig = s[1]
        # get the indices of those inside and outside the clip area
        tmpisig = (abs(tmp) < sig).nonzero()[0]
        tmpisige = (abs(tmp) > sig).nonzero()[0]
        frac_less_than_sig = float(tmpisig.shape[0]) / dat[0].shape[0]
        # print(frac_less_than_sig)
        if frac_less_than_sig < 1.0:
            out = self._filt_run([dat[0][tmpisig], dat[1][tmpisig], err[tmpisig]],
                                 filt, do_sim=False, vplot=False)
            p = out[1]
            myoutput = out[0]
            t = "-test" if sig_test[s[0]] else ""
            ret.update({"sig" + str(sig) + t:
                        self._extract_info(p, myoutput.sd_beta, myoutput)})
            tmp = (dat[1][tmpisig] -
                   self.modelfunc_small_te(p, dat[0][tmpisig])) / err[tmpisig]
            dof = tmp.shape[0] - myoutput.beta.shape[0]
            chisq = (tmp**2).sum()
            try:
                ntest = normaltest(tmp)[1]
            except Exception:
                ntest = 0.0
            ret["sig" + str(sig) + t].update({"ndata": dat[0][tmpisig].shape[0],
                                              "chisq": chisq, "dof": dof,
                                              "p_chi": chisqprob(chisq, dof),
                                              "normalcy_prob": ntest,
                                              "frac_data_remaining": frac_less_than_sig})
            if doplot:
                plot(dat[0][tmpisige], dat[1][tmpisige], ".")
    return ret
def _calculate_LRTs(self):
    """Run likelihood ratio tests if there are enough results"""
    if all([m in self.keys() for m in [1, 2]]):
        D = -2 * self[1].lnL + 2 * self[2].lnL
        pval = chisqprob(D, 2)
        self.LRT_m1m2 = (D, pval)
    if all([m in self.keys() for m in [7, 8]]):
        D = -2 * self[7].lnL + 2 * self[8].lnL
        pval = chisqprob(D, 2)
        self.LRT_m7m8 = (D, pval)
def LR(self):
    try:
        return self._cache['LR']
    except AttributeError:
        self._cache = {}
        P = 1.0 * np.sum(self.y) / self.n
        LR = float(-2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P))
                         - self.logl))
        self._cache['LR'] = (LR, chisqprob(LR, self.k))
    except KeyError:
        P = 1.0 * np.sum(self.y) / self.n
        LR = float(-2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P))
                         - self.logl))
        self._cache['LR'] = (LR, chisqprob(LR, self.k))
    return self._cache['LR']
def pfisher(pvalues):
    """
    Combine independent P-values into one according to

    Fisher, R. A. (1948) Combining independent tests of significance.
    American Statistician, vol. 2, issue 5, page 30.

    ('Fisher method' or 'inverse ChiSquare method')  See also book:
    Walter W. Piegorsch, A. John Bailer: Analyzing Environmental Data.
    Wiley 2005

    @param pvalues: list of independent P-values
    @type  pvalues: [ float ]

    @return: P-value
    @rtype: float
    """
    ## stats.mannwhitneyu minimal P ~ stats.zprob( 8.2 );
    ## all below becomes 0. which is not handled by the fisher test
    clipped = N.clip(pvalues, 1.0e-16, 1.0)

    x2 = -2 * N.sum(N.log(clipped))

    if not USING_SCIPY:
        x2 = float(x2)

    return stats.chisqprob(x2, 2 * len(pvalues))
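# Hypothetical usage sketch for pfisher above, with made-up p-values (`N` is
# the snippet's own numpy alias).  Fisher's statistic -2*sum(log(p)) is
# chi-squared with 2k dof, so the combined P-value can be cross-checked
# directly against scipy.stats.chi2.sf:
import numpy as N
from scipy.stats import chi2

pvals = [0.01, 0.20, 0.45]
x2 = -2 * N.sum(N.log(N.clip(pvals, 1.0e-16, 1.0)))
print(chi2.sf(x2, 2 * len(pvals)))  # same value pfisher(pvals) returns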
def _calculate_hwe(self, snp, genotypes):
    """
    Calculates p-value for HWE using ChiSquare statistic: remove missing,
    get observed counts, get observed frequencies, get expected counts,
    calculate test values using (O-E)**2 / E and return ChiSquare
    probability with 1 degree of freedom (bi-allelic SNP).
    """
    adjusted_samples = self.sample_size - genotypes.count(self.missing)
    hetero_obs, major_obs, minor_obs = self._get_observed(genotypes)

    try:
        p = (major_obs + (hetero_obs / 2)) / adjusted_samples
        q = (minor_obs + (hetero_obs / 2)) / adjusted_samples
    except ZeroDivisionError:
        # print("Detected complete missing data in SNP:", snp)
        return 0

    if (p + q) != 1:
        raise ValueError("Sum of observed allele frequencies (p + q) does not equal one.")

    hetero_exp, major_exp, minor_exp = self._get_expected(p, q, adjusted_samples)

    try:
        hetero_test = ((hetero_obs - hetero_exp) ** 2) / hetero_exp
        major_test = ((major_obs - major_exp) ** 2) / major_exp
        minor_test = ((minor_obs - minor_exp) ** 2) / minor_exp
    except ZeroDivisionError:
        return 0

    return stats.chisqprob(sum([hetero_test, major_test, minor_test]), 1)
def hardy_weinberg_asymptotic(obs_het, obs_a, obs_b):
    obs_het = float(obs_het)
    obs_a = float(obs_a)
    obs_b = float(obs_b)

    sample_size = obs_het + obs_a + obs_b
    p = ((2 * obs_a) + obs_het) / (2 * sample_size)
    q = 1 - p

    exp_a = p * p * sample_size
    exp_b = q * q * sample_size
    exp_ab = 2 * p * q * sample_size

    # get chiSquare values: (O - E)**2 / E for each genotype class
    if exp_a == 0:
        chi_a = 0
    else:
        chi_a = ((obs_a - exp_a) ** 2.0) / exp_a
    if exp_b == 0:
        chi_b = 0
    else:
        chi_b = ((obs_b - exp_b) ** 2.0) / exp_b
    if exp_ab == 0:
        chi_ab = 0
    else:
        chi_ab = ((obs_het - exp_ab) ** 2.0) / exp_ab

    chi_sq_total = chi_a + chi_b + chi_ab
    return stats.chisqprob(chi_sq_total, 1)
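# Quick sanity check for hardy_weinberg_asymptotic above, using made-up counts
# at exact Hardy-Weinberg proportions (p = q = 0.5): expected counts equal the
# observed ones, the statistic is 0, and the returned probability is 1.
print(hardy_weinberg_asymptotic(50, 25, 25))  # -> 1.0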
def gof(self, x, y, ye):
    '''
    Computes GoF test statistics and other diagnostic tests

    Returns:
    --------
    - GoF test: Chi^2, p-value, and ddof
    - Normality of residuals: K^2 and p-value
    '''
    resid = y - self(x)
    chisq = np.sum((resid / ye) ** 2)
    # number of estimated parameters
    ddof = len(x) - len([e for e in self.errors() if e])
    chisq_pvalue = chisqprob(chisq, ddof)
    gof = (chisq, chisq_pvalue, ddof)
    resid = normaltest(resid)

    ym = y.mean()
    SStot = np.sum((y - ym) ** 2)
    SSerr = np.sum((y - self(x)) ** 2)
    Rsquared = 1.0 - SSerr / SStot

    # Besides being buggy, this test for homoscedasticity is supposed to work
    # only for linear regressions, hence is not suited for our case, but I'll
    # keep it here until I figure out an alternative.  Remember to uncomment
    # the import for OLS on top.
    # regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
    # LM = regresults.rsquared
    # LM_pvalue = chisqprob(LM, len(x) - ddof)
    # white = (LM, LM_pvalue)
    # return gof, resid, white
    return gof, resid, Rsquared
def test_for_pihm_w_likelihood(self, guess=[10, -5],
                               k_array=np.linspace(0.05, 2, 100),
                               fixed_pre=False, disp=True):
    """
    Test a dataset for parasite induced host mortality using the
    likelihood method.

    This method compares a reduced model (negative binomial distribution)
    to a full model (negative binomial with PIHM).  The two models are
    nested and differ by two parameters: a and b.  This amounts to fitting
    a negative binomial distribution to the data, then fitting the full
    model to the data and comparing likelihoods using a likelihood ratio
    test.  The likelihood ratio should be approximately chi-squared with
    dof equal to the difference in the number of parameters.

    Parameters
    ----------
    guess : list
        Guesses for a and b
    k_array : array
        Array of values over which to search for the best-fit k
    fixed_pre : bool
        If True, the premortality parameters (mup and kp) are fixed.
        Else, they are jointly estimated from the data.
    disp : bool
        If True, a convergence message is printed.  If False, no
        convergence message is printed.

    Returns
    -------
    : tuple
        chi-squared value, p-value, full nll, reduced nll, full parameters
    """
    # No params are known
    if not fixed_pre:
        # Get full nll
        params = self.likelihood_method(full_fit=True, guess=guess, disp=disp)
        full_nll = likefxn1(params, self.data)

        mle_fit = mod.nbinom.fit_mle(self.data, k_array=k_array)
        red_nll = comp.nll(self.data, mod.nbinom(*mle_fit))

    # Params are known
    else:
        params = self.likelihood_method(full_fit=False, guess=guess, disp=disp)
        full_nll = likefxn2(params[2:], self.data, self.mup, self.kp)
        red_nll = comp.nll(self.data, mod.nbinom(self.mup, self.kp))

    # Approximately chi-squared...though this is a large sample size approx.
    chi_sq = 2 * (-full_nll - (-red_nll))
    prob = chisqprob(chi_sq, 2)

    return chi_sq, prob, full_nll, red_nll, params
def get_pValue(self, mutheta, sigma=None):
    minusPointLogLike = self.like(mutheta, sigma)
    minusMaxLogLike = self.ml
    size = 2
    CS = 2.0 * (minusPointLogLike - minusMaxLogLike)
    pValue = stats.chisqprob(CS, size)
    return pValue
def fit1D(binnedE, binnedT, fracsE, fracsT, nevents, PearsonErrs=True, debug=False):
    # Concatenate data and prediction vectors
    datavec = np.append(binnedE, binnedT)
    predvec = [np.append(fracsE[0], fracsT[0]),
               np.append(fracsE[1], fracsT[1])]

    # Define inputs for minimizer
    predfunc = lambda p: p[0] * predvec[0] + p[1] * predvec[1]
    if PearsonErrs:
        func = lambda p: (datavec - predfunc(p)) / np.sqrt(predfunc(p))
    else:
        func = lambda p: (datavec - predfunc(p)) / np.sqrt(datavec)

    pfit, pcov, infodict, errmsg, success = sp.optimize.leastsq(
        func, nevents, full_output=1)

    chi2 = sum([elem**2 for elem in infodict['fvec']])
    # mychi2 = sum([(datavec[i] - predfunc(pfit)[i])**2. / predfunc(pfit)[i]
    #               for i in xrange(len(datavec))])
    ### This just equals 'chi2' calculated above
    dof = datavec.size - 2
    pval = st.chisqprob(chi2, dof)

    if debug:
        print('---------------------- 1-D Fit --------------------------------')
        print('Best fits: %s' % pfit)
        print('Cov. mat.: %s' % pcov)
        print('Chi^2:     %s' % chi2)
        print('d.o.f.:    %s' % dof)
        print('P-value:   %s' % pval)
        print('---------------------------------------------------------------')

    return pfit, pcov, chi2, pval
def chisq_poisson(data):
    '''
    Tests if the data comes from a Poisson distribution.  This is done
    using the Pearson Chi-Square test.  Each value from the data given is
    treated like a categorical attribute, where the number of occurrences
    of the value is tested against the expected occurrences if the data
    came from a Poisson distribution.

    The hypotheses are:
        * H0 (null) - The data comes from a Poisson distribution
        * H1 - The data does NOT come from a Poisson distribution

    Arguments
    ---------
    data: array like
        Array with observations

    Returns
    -------
    (chi-square value, p-value): The chi-square value found and a p-value
    for the null hypothesis.

    Notes
    -----
    This implementation does not do any special treatment for values with
    small numbers of occurrences.
    '''
    all_freqs, expected_freqs = _poisson_inputs(data)
    chisq = stats.chisquare(all_freqs, expected_freqs)[0]
    # one dof lost to the totals constraint, one to the estimated lambda
    pval = stats.chisqprob(chisq, len(all_freqs) - 2)
    return chisq, pval
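# The helper _poisson_inputs is not shown above.  A hypothetical sketch of
# what such a helper might compute: observed counts for each value 0..max and
# the Poisson-expected counts, using the sample mean as the lambda estimate.
import numpy as np
from scipy.stats import poisson

def poisson_inputs(data):
    data = np.asarray(data)
    observed = np.bincount(data).astype(float)  # counts of 0 .. max(data)
    values = np.arange(len(observed))
    expected = poisson.pmf(values, data.mean()) * len(data)
    return observed, expected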
def chisquare(obs, exp=None):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    If no expected frequencies are supplied, the total N is assumed to be
    equally distributed across all cells.

    Returns: chisquare-stats, associated p-value (upper tail)
    """
    obs = N.array(obs)

    # get total number of observations
    nobs = N.sum(obs)

    # if no expected values are supplied assume equal distribution
    if exp is None:
        exp = N.ones(obs.shape) * nobs / N.prod(obs.shape)

    # make sure to have floating point data
    exp = exp.astype(float)

    # compute chisquare value
    chisq = N.sum((obs - exp) ** 2 / exp)

    # return chisq and probability (upper tail)
    return chisq, stats.chisqprob(chisq, N.prod(obs.shape) - 1)
def computeContingencyTablePValue(*observedTuples):
    if len(observedTuples) == 0:
        return None

    rowSums = []
    for row in observedTuples:
        rowSums.append(float(sum(row)))

    columnSums = []
    for i in range(len(observedTuples[0])):
        columnSum = 0.0
        for row in observedTuples:
            columnSum += row[i]
        columnSums.append(float(columnSum))

    grandTotal = float(sum(rowSums))

    observedTestStatistic = 0.0
    for i in range(len(observedTuples)):
        for j in range(len(columnSums)):
            expectedValue = (rowSums[i] / grandTotal) * (columnSums[j] / grandTotal) * grandTotal
            observedValue = float(observedTuples[i][j])
            observedTestStatistic += ((observedValue - expectedValue) ** 2) / expectedValue

    degreesFreedom = (len(columnSums) - 1) * (len(rowSums) - 1)

    from scipy.stats import chisqprob
    return chisqprob(observedTestStatistic, degreesFreedom)
def chisquare(obs, exp='uniform'):
    """Compute the chisquare value of a contingency table with arbitrary
    dimensions.

    Parameters
    ----------
    obs : array
      Observations matrix
    exp : ('uniform', 'indep_rows', 'indep_cols') or array, optional
      Matrix of expected values of the same size as `obs`.  If no array is
      given, then 'uniform' evenly distributes all observations.  In the
      'indep_rows' case the contingency table takes into account frequencies
      relative across different columns, so, if the contingency table is
      predictions vs targets, it would account for dis-balance among
      different targets.  Although 'uniform' is the default, for confusion
      matrices 'indep_rows' is preferable.

    Returns
    -------
    tuple
      chisquare-stats, associated p-value (upper tail)
    """
    obs = np.array(obs)

    # get total number of observations
    nobs = np.sum(obs)

    # if no expected values are supplied assume equal distribution
    if not isinstance(exp, np.ndarray):
        ones = np.ones(obs.shape, dtype=float)
        if exp == 'indep_rows':
            # multiply each column
            exp = np.sum(obs, axis=0)[None, :] * ones / obs.shape[0]
        elif exp == 'indep_cols':
            # multiply each row
            exp = np.sum(obs, axis=1)[:, None] * ones / obs.shape[1]
        elif exp == 'uniform':
            # just evenly distribute
            exp = nobs * np.ones(obs.shape, dtype=float) / np.prod(obs.shape)
        else:
            raise ValueError(
                "Unknown specification of expected values exp=%r" % (exp,))
    else:
        assert exp.shape == obs.shape

    # make sure to have floating point data
    exp = exp.astype(float)

    # compute chisquare value
    exp_zeros = exp == 0
    exp_nonzeros = np.logical_not(exp_zeros)
    if np.sum(exp_zeros) != 0 and (obs[exp_zeros] != 0).any():
        raise ValueError("chisquare: Expected values have 0-values, but there"
                         " are actual observations -- chi^2 cannot be computed")

    chisq = np.sum(((obs - exp) ** 2)[exp_nonzeros] / exp[exp_nonzeros])

    # return chisq and probability (upper tail), taking only the elements
    # with something expected
    return chisq, st.chisqprob(chisq, np.sum(exp_nonzeros) - 1)
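# Hypothetical usage sketch for the chisquare variant above, on a made-up 2x2
# confusion-style matrix (assumes the snippet's np/st imports are in scope).
# Under exp='uniform' every cell expects nobs/4 = 15; under exp='indep_rows'
# expectations follow the column totals, the recommended choice for
# confusion matrices.
obs = [[10, 5],
       [30, 15]]
print(chisquare(obs))                    # uniform expectations
print(chisquare(obs, exp='indep_rows'))  # column-marginal expectations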
def get_p_value_pearson_chi_squared(contingency_table):
    (n11, n12, n21, n22) = contingency_table
    n = n11 + n12 + n21 + n22
    if n == 0:
        raise ValueError("The contingency table is empty")

    n_1_plus = float(n11 + n12)
    n_2_plus = float(n21 + n22)
    n_plus_1 = float(n11 + n21)
    n_plus_2 = float(n12 + n22)

    if n == n_1_plus:
        return float(1)
    elif n == n_2_plus:
        return float(1)

    # eij = (n_i_plus)(n_plus_j)/n
    e11 = n_1_plus * n_plus_1 / n
    e12 = n_1_plus * n_plus_2 / n
    e21 = n_2_plus * n_plus_1 / n
    e22 = n_2_plus * n_plus_2 / n

    chi2 = (math.pow(n11 - e11, 2) / e11) + (math.pow(n12 - e12, 2) / e12) \
         + (math.pow(n21 - e21, 2) / e21) + (math.pow(n22 - e22, 2) / e22)
    p_value = chisqprob(chi2, 1)
    return p_value
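# Hypothetical usage sketch for get_p_value_pearson_chi_squared above; the 2x2
# counts (n11, n12, n21, n22) are made up, and the snippet's math/chisqprob
# imports are assumed to be in scope.  The expected cells are e11 = 12,
# e12 = 18, e21 = 28, e22 = 42, giving chi^2 ~= 0.794 on 1 dof and p ~= 0.37.
print(get_p_value_pearson_chi_squared((10, 20, 30, 40)))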
def chi_square(hist_1, hist_2):
    diff_1 = hist_1 - hist_2
    val = np.nansum(np.power(diff_1, 2) / (hist_1 + hist_2))
    ddof = len(hist_1)
    print('T = {}'.format(val))
    print('P(chi^2 > T) = {}'.format(chisqprob(val, ddof)))
def check_chisquare(f_obs, f_exp, ddof, axis, expected_chi2):
    # Use this only for arrays that have no masked values.
    f_obs = np.asarray(f_obs)
    if axis is None:
        num_obs = f_obs.size
    else:
        if axis == 'no':
            use_axis = 0
        else:
            use_axis = axis
        b = np.broadcast(f_obs, f_exp)
        num_obs = b.shape[use_axis]

    if axis == 'no':
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, axis=axis)
    assert_array_equal(chi2, expected_chi2)

    ddof = np.asarray(ddof)
    expected_p = stats.chisqprob(expected_chi2, num_obs - 1 - ddof)
    assert_array_equal(p, expected_p)

    # Also compare to stats.chisquare
    if axis == 'no':
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof,
                                               axis=axis)
    assert_array_almost_equal(chi2, stats_chisq)
    assert_array_almost_equal(p, stats_p)
def plot_data_key(self, nm, conn, gnm=None):
    self.gather_points(nm, conn)
    self.gdat = [[n, self.datpts[k].value * self.yscale, self.datpts[k].err * self.yscale]
                 for (n, k) in enumerate(self.datkeys)
                 if self.datpts[k].value is not None]

    try:
        self.LF.fit(self.gdat, cols=(0, 1, 2), errorbarWeights=True)
        chi2 = self.LF.chisquared()
        ndf = self.LF.nu()
        statdat = {"mu": self.LF.coeffs[0], "rms": self.LF.rmsDeviation(),
                   "uncert": self.LF.coeffErr(0), "chi2": chi2, "ndf": ndf}
        if stats:
            statdat["prob"] = stats.chisqprob(statdat["chi2"], statdat["ndf"])
        else:
            statdat["prob"] = 0
        if gnm:
            gnm = gnm % statdat
            print(gnm)
        self.g.plot(graph.data.function("y(x)=%g" % self.LF.coeffs[0], title=None),
                    [graph.style.line(lineattrs=[self.ptcolor, style.linestyle.dashed])])
    except Exception:
        gnm = None

    self.g.axes['x'].max = self.gdat[-1][0]
    self.g.plot(graph.data.points(self.gdat, x=1, y=2, dy=3, title=gnm),
                [graph.style.errorbar(errorbarattrs=[self.ptcolor]),
                 graph.style.symbol(self.ptsymb, size=0.15,
                                    symbolattrs=[self.ptcolor])])
def walds_test(profile1, profile2):
    """Calculate the compatibility of two statistically independent
    measurements using a normal approximation (Wald's method).  This
    assumes that the log-likelihood space is approximately elliptical.

    Parameters
    ----------
    profile1 : (x, y, llh) for measurement 1
    profile2 : (x, y, llh) for measurement 2
    """
    from scipy.stats import chisqprob
    from scipy.special import erfinv

    bestfits, covariances = [], []
    for x, y, llhs in [profile1, profile2]:
        idx_min = np.unravel_index(llhs.argmin(), llhs.shape)
        bestfit = x[idx_min[1]], y[idx_min[0]]
        bestfits.append(bestfit)

        covariance = estimate_cov_from_contour(x, y, llhs, bestfit)
        covariances.append(covariance)

    diff = np.matrix(bestfits[0]) - np.matrix(bestfits[1])
    cov_inv = np.linalg.inv(covariances[0] + covariances[1])

    chi2 = diff * cov_inv * diff.transpose()
    ndof = 2
    pvalue = chisqprob(chi2, ndof)
    nsigma = erfinv(1 - pvalue) * np.sqrt(2)  # 2-sided significance
    return (chi2, ndof, pvalue, nsigma)
def likelihood_ratio_test(ll_min: float, ll_max: float,
                          dof_min: int, dof_max: int) -> (float, float):
    """
    Assesses the goodness of fit of two competing statistical models based
    on the ratio of their likelihoods.

    Parameters
    ----------
    ll_min : float
        Log-likelihood of the less complex model.
    ll_max : float
        Log-likelihood of the more complex model.
    dof_min : int
        Degrees of freedom of the less complex model.
    dof_max : int
        Degrees of freedom of the more complex model.

    Returns
    -------
    (float, float)
        lr: Likelihood ratio.
        p: p value.
    """
    lr = 2 * (ll_max - ll_min)
    delta_dof = dof_max - dof_min
    p = stats.chisqprob(lr, delta_dof)
    return (lr, p)
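# Hypothetical usage sketch for likelihood_ratio_test above, with made-up
# log-likelihoods: a 3-parameter null model against a 5-parameter alternative
# gives lr = 2 * (-115.1 - (-120.3)) = 10.4 on 2 dof.  (On SciPy >= 1.0 this
# relies on a chisqprob shim such as the one near the top of this section.)
lr, p = likelihood_ratio_test(ll_min=-120.3, ll_max=-115.1,
                              dof_min=3, dof_max=5)
print(lr, p)  # p ~= 0.0055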
def englishness(s):
    if s is None:
        return 0
    from scipy.stats import chisqprob
    # uses a chi-square algorithm to match the relative character frequencies
    # in the test string to those of real English
    score = 0
    s = s.lower()
    for c in EXTRA_CHARS:
        s = s.replace(c, '')  # completely ignore characters in EXTRA_CHARS
    frequency = defaultdict(float)
    ignored = 0  # ignore, but only for purpose of chisquare computation
    length = len(s)
    for i in s:  # analyze each character
        if i not in string.printable:  # non-printables are bad
            score += 2
            ignored += 1
        elif i in string.digits:  # digits aren't that bad
            score += 0.5
            ignored += 1
        elif i not in string.ascii_lowercase and i not in ' ':  # special chars are eh
            score += 1
            ignored += 1
        else:
            frequency[i] += 1
    # analyze alphabetic frequencies
    for i in frequency:
        freq = frequency[i] / (length - ignored)
        # Chi square
        score += pow((freq - CHAR_FREQ[i] / 100), 2) / (CHAR_FREQ[i] / 100)
    if not score:
        return 0
    return chisqprob(score, 1) * 100  # return probability
def get_most_likely_cn(combo, cn_lik, pval_cutoff):
    '''
    Use the most likely phi state, unless p < cutoff when compared to the
    most likely clonal (phi = 1) case (log likelihood ratio test); in that
    case, pick the CN state with the highest clonal likelihood.
    '''
    cn_lik_phi, cn_lik_clonal = cn_lik
    ll_phi, ll_clonal = cn_lik_phi[1], cn_lik_clonal[1]
    empty_result = [float('nan'), float('nan'), float('nan'), float('nan')]

    if len(combo) == 0:
        return empty_result
    elif len(combo) == 1:
        return combo[0]
    elif np.all(np.isnan(ll_phi)) and np.all(np.isnan(ll_clonal)):
        return empty_result
    elif np.all(np.isnan(ll_phi)):
        return combo[index_of_max(ll_clonal)]
    elif np.all(ll_phi == ll_clonal) or pval_cutoff == 0:
        return combo[index_of_max(ll_phi)]

    # log likelihood ratio test; null hypothesis = likelihood under phi
    # use clonal if best clonal solution significantly better than worst phi solution
    # LLR = 2 * (np.nanmax(ll_clonal) - np.nanmax(ll_phi))
    LLR = 2 * (np.nanmax(ll_clonal) - np.nanmin(ll_phi))
    p_val = stats.chisqprob(LLR, 1) if not np.isnan(LLR) else 1

    if p_val < pval_cutoff:
        return combo[index_of_max(ll_clonal)]
    else:
        return combo[index_of_max(ll_phi)]
def FisherMethodPvals(pvalues_array):
    # (adapted from code by Arie Shaus, Nov 2016)
    pvalues_array = np.array(pvalues_array)
    k = len(pvalues_array)
    z = -2 * sum(np.log(pvalues_array))
    combined_Pval = chisqprob(z, 2 * k)
    return combined_Pval
def wald_test(betas, r, q, vm):
    '''
    Chi sq. Wald statistic to test for restriction of coefficients.
    Implementation following Greene [Greene2003]_ eq. (17-24), p. 488

    ...

    Parameters
    ==========
    betas   : array
              kx1 array with coefficient estimates
    r       : array
              Array of dimension Rxk (R being number of restrictions) with
              the constraint setup.
    q       : array
              Rx1 array with constants in the constraint setup.  See
              Greene [Greene2003]_ for reference.
    vm      : array
              kxk variance-covariance matrix of coefficient estimates

    Returns
    =======
    w       : float
              Wald statistic
    pvalue  : float
              P value for the Wald statistic, calculated from a Chi sq.
              distribution with R degrees of freedom
    '''
    rbq = np.dot(r, betas) - q
    rvri = la.inv(np.dot(r, np.dot(vm, r.T)))
    w = np.dot(rbq.T, np.dot(rvri, rbq))[0][0]
    df = r.shape[0]
    pvalue = chisqprob(w, df)
    return w, pvalue
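# Hypothetical usage sketch for wald_test above, with made-up estimates and a
# diagonal variance-covariance matrix (assumes the snippet's np, la and
# chisqprob imports are in scope).  A single restriction H0: beta_2 = 0 gives
# w = 0.2**2 / 0.01 = 4.0 on R = 1 dof, so p ~= 0.0455.
betas = np.array([[1.0], [0.5], [0.2]])  # kx1 coefficient estimates
r = np.array([[0.0, 0.0, 1.0]])          # Rxk restriction matrix
q = np.array([[0.0]])                    # Rx1 constants
vm = np.eye(3) * 0.01                    # kxk covariance of the estimates
print(wald_test(betas, r, q, vm))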
def check_sample_var(sv, n, popvar):
    # two-sided chisquare test for sample variance equal to hypothesized variance;
    # the statistic is (n-1)*s^2 / sigma0^2
    df = n - 1
    chi2 = (n - 1) * sv / float(popvar)
    pval = stats.chisqprob(chi2, df) * 2
    npt.assert_(pval > 0.01,
                'var fail, t, pval = %f, %f, v, sv = %f, %f' %
                (chi2, pval, popvar, sv))
def LR(self):
    if 'LR' not in self._cache:
        P = 1.0 * np.sum(self.y) / self.n
        LR = float(-2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P))
                         - self.logl))
        self._cache['LR'] = (LR, chisqprob(LR, self.k))
    return self._cache['LR']
def gtest(obs, exp, ddof=0):
    '''
    http://en.wikipedia.org/wiki/G-test

    test for goodness of fit to expected frequencies

    obs  - observed freqs
    exp  - expected freqs
    ddof - delta dof

    returns the G statistic and a p value

    based on https://gist.github.com/brentp/570896
    '''
    assert len(obs) == len(exp)
    assert 0.0 not in exp

    n = len(obs)
    g = 0.0
    for i in range(n):
        if obs[i] == 0.0:
            continue  # Oi * ln(Oi / Ei) == 0 if Oi == 0
        g += obs[i] * math.log(obs[i] / exp[i])
        if exp[i] < 5.0:
            sys.stderr.write("warning: expected value less than 5 in gtest\n")
    g *= 2.0
    return g, chisqprob(g, n - 1 - ddof)
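# Hypothetical usage sketch for gtest above, with made-up frequencies (assumes
# the snippet's math/sys/chisqprob imports are in scope):
# G = 2 * sum(Oi * ln(Oi / Ei)), compared against chi-squared on
# n - 1 - ddof dof, here 2.
obs = [10, 20, 30]
exp = [20, 20, 20]
print(gtest(obs, exp))  # G ~= 10.47, p ~= 0.0053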
def lrtest(llmin, llmax):
    """
    Likelihood Ratio Test (LRT) by Joanna Diong
    https://scientificallysound.org/2017/08/24/the-likelihood-ratio-test-relevance-and-application/

    Example:

        # import example dataset
        data = sm.datasets.get_rdataset("dietox", "geepack").data

        # fit time only to pig weight
        md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
        mdf = md.fit(reml=False)
        print(mdf.summary())
        llf = mdf.llf

        # fit time and litter to pig weight
        mdlitter = smf.mixedlm("Weight ~ Time + Litter", data, groups=data["Pig"])
        mdflitter = mdlitter.fit(reml=False)
        print(mdflitter.summary())
        llflitter = mdflitter.llf

        lr, p = lrtest(llf, llflitter)
        print('LR test, p value: {:.2f}, {:.4f}'.format(lr, p))

    :param llmin: Log-likelihood of null model (the model without the
        variable we are considering to add).
    :param llmax: Log-likelihood of the alternative model (the model with
        the extra variable).
    :return: lr, p
        * lr: likelihood ratio
        * p: p-value to reject the hypothesis that the alternative model
          fits the data no better than the null model.
    """
    lr = 2 * (llmax - llmin)
    p = stats.chisqprob(lr, 1)  # llmax has 1 dof more than llmin
    return lr, p
def p_true(self):
    self.truelike = self.M_true.likelihood(self.M_true.simData)
    self.findmle(self.tau01_true, self.tau12_true, self.N_true)
    self.CS = 2.0 * (self.ml - self.truelike)  # plus log-likelihood
    numparam = 2
    self.pValue = stats.chisqprob(self.CS, numparam)
    return self.pValue
def test_for_pihm(data, guess=[10, -5], crof_params=None):
    """
    Test a dataset for parasite induced host mortality

    Parameters
    ----------
    data : array
        Hosts with given parasite loads

    Returns
    -------
    : tuple
        chi-squared value, p-value, full nll, reduced nll
    """
    # Get full nll
    params = likelihood_method(data, crof_params=crof_params, guess=guess)
    full_nll = likefxn1(params, data)

    if crof_params:
        mu, k = np.array(crof_params)[1:]
        red_nll = comp.nll(data, mod.nbinom(mu, k))
    else:
        mle_fit = mod.nbinom.fit_mle(data, k_array=np.linspace(.1, 2, 100))
        red_nll = comp.nll(data, mod.nbinom(*mle_fit))

    chi_sq = 2 * (-full_nll - (-red_nll))
    prob = chisqprob(chi_sq, 2)

    return chi_sq, prob, full_nll, red_nll
def __init__(self, statmatch):
    """
    Populate a pandas data frame and pass it forward as BalanceStatistics.

    Generally, operations are vectorized where possible and each method
    works on several covariates from a statistical matching routine at a
    time.

    :param statmatch: StatisticalMatching instance that has been fitted
    :return: BalanceStatistics instance
    """
    # Could be replaced with an ordered dictionary
    columns = ['unmatched_treated_mean', 'unmatched_control_mean',
               'unmatched_bias', 'unmatched_t_statistic',
               'unmatched_p_value', 'matched_treated_mean',
               'matched_control_mean', 'matched_bias',
               'matched_t_statistic', 'matched_p_value', 'bias_reduction']

    data = {'unmatched_treated_mean': self._unmatched_treated_mean(statmatch),
            'unmatched_control_mean': self._unmatched_control_mean(statmatch),
            'unmatched_bias': self._unmatched_bias(statmatch),
            'unmatched_t_statistic': self._unmatched_t_statistic(statmatch),
            'unmatched_p_value': self._unmatched_p_value(statmatch),
            'matched_treated_mean': self._matched_treated_mean(statmatch),
            'matched_control_mean': self._matched_control_mean(statmatch),
            'matched_bias': self._matched_bias(statmatch),
            'matched_t_statistic': self._matched_t_statistic(statmatch),
            'matched_p_value': self._matched_p_value(statmatch),
            'bias_reduction': self._bias_reduction(statmatch)}

    # dataframe with columns defined above
    super(BalanceStatistics, self).__init__(data, index=statmatch.names,
                                            columns=columns)

    # Whenever it becomes a problem that we have three copies of how to run
    # regression, we can refactor this into another class
    fitted_reg = self._fit_unmatched_regression(statmatch)
    self.unmatched_prsquared = 1 - fitted_reg.llf / fitted_reg.llnull
    self.unmatched_llr = -2 * (fitted_reg.llnull - fitted_reg.llf)
    self.unmatched_llr_pvalue = chisqprob(self.unmatched_llr,
                                          fitted_reg.df_model)

    fitted_reg = self._fit_matched_regression(statmatch)
    self.matched_prsquared = 1 - fitted_reg.llf / fitted_reg.llnull
    self.matched_llr = -2 * (fitted_reg.llnull - fitted_reg.llf)
    self.matched_llr_pvalue = chisqprob(self.matched_llr,
                                        fitted_reg.df_model)
def hessTest(self, L):
    matrixList = L[:]
    for xy in matrixList:
        xy = numpy.matrix(xy).T
        xy[0] -= self.MLE[0]
        xy[1] -= self.MLE[1]
        CS = xy.T * self.H * xy
        print('p-value:', stats.chisqprob(CS, 2))
def fisher_combine(pvals):
    """Combine p-values into one with Fisher's method, skipping "NA" entries."""
    if all(p == "NA" for p in pvals):
        return np.nan
    pvals = [p for p in pvals if p != "NA"]
    if len(pvals) == 1:
        return pvals[0]
    s = -2 * np.sum(np.log(pvals))
    return chisqprob(s, 2 * len(pvals))
def likelihood_ratio_test(counts, model1, model2):
    # see formula <http://en.wikipedia.org/wiki/Likelihood-ratio_test>
    print('Test %s and %s' % (model1.name, model2.name))
    D = -2 * (model1.lnL - model2.lnL)
    df = model2.df - model1.df
    p_value = chisqprob(D, df)
    print('D = %.1f, df = %d, P-value = %.2g' % (D, df, p_value))
def chisquare(obs, exp="uniform"): """Compute the chisquare value of a contingency table with arbitrary dimensions. Parameters ---------- obs : array Observations matrix exp : ('uniform', 'indep_rows') or array, optional Matrix of expected values of the same size as `obs`. If no array is given, then for 'uniform' -- evenly distributes all observations. In 'indep_rows' case contingency table takes into account frequencies relative across different columns, so, if the contingency table is predictions vs targets, it would account for dis-balance among different targets. Although 'uniform' is the default, for confusion matrices 'indep_rows' is preferable. Returns ------- tuple chisquare-stats, associated p-value (upper tail) """ obs = np.array(obs) # get total number of observations nobs = np.sum(obs) # if no expected value are supplied assume equal distribution if not isinstance(exp, np.ndarray): ones = np.ones(obs.shape, dtype=float) if exp == "indep_rows": # multiply each column exp = np.sum(obs, axis=0)[None, :] * ones / obs.shape[0] elif exp == "indep_cols": # multiply each row exp = np.sum(obs, axis=1)[:, None] * ones / obs.shape[1] elif exp == "uniform": # just evenly distribute exp = nobs * np.ones(obs.shape, dtype=float) / np.prod(obs.shape) else: raise ValueError, "Unknown specification of expected values exp=%r" % (exp,) else: assert exp.shape == obs.shape # make sure to have floating point data exp = exp.astype(float) # compute chisquare value exp_zeros = exp == 0 exp_nonzeros = np.logical_not(exp_zeros) if np.sum(exp_zeros) != 0 and (obs[exp_zeros] != 0).any(): raise ValueError, "chisquare: Expected values have 0-values, but there are actual" " observations -- chi^2 cannot be computed" chisq = np.sum(((obs - exp) ** 2)[exp_nonzeros] / exp[exp_nonzeros]) # return chisq and probability (upper tail) # taking only the elements with something expected return chisq, st.chisqprob(chisq, np.sum(exp_nonzeros) - 1)
def hessEval(r, theta, nbf, alpha=0.0):
    global H, MLE
    x = r * math.cos(theta)
    y = r * math.sin(theta)
    # print 'x', x, 'y', y
    # HI = numpy.matrix(H).I
    xy = numpy.matrix([[x], [y]])
    CS = xy.T * H * xy
    return stats.chisqprob(CS, 2) - alpha
def chi2(BET, feature_1, feature_2):
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    obs_freq = {}
    exp_freq = {}
    sum_exp_freq_vertical = np.zeros(len(feature_2))
    chi2 = 0

    # observed frequencies and column totals
    for i in range(len(feature_1)):
        obs_freq[feature_1[i]] = []
        for j in range(len(feature_2)):
            col1 = feature_1[i]
            col2 = feature_2[j]
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)
        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]])

    total_in_contingency = sum(sum_exp_freq_vertical)

    # expected frequencies from row and column totals
    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal * sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    # Pearson statistic
    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j]) ** 2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)
    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('chisqprob: ' + str(chisqprob(chi2, df)))
    return chisqprob(chi2, df)
def calculateCombinedFisher(significanceValuesList):
    # X^2_2k ~ -2 * sum(ln(p_i))
    accumulatedValue = 0
    for significanceValues in significanceValuesList:
        accumulatedValue += log(significanceValues[2])
    accumulatedValue = accumulatedValue * -2
    return chisqprob(accumulatedValue, 2 * len(significanceValuesList))
def chi_square_shape(hist_1, hist_2):
    n1 = np.sum(hist_1)
    n2 = np.sum(hist_2)
    diff_1 = (hist_1 / n1) - (hist_2 / n2)
    sum_1 = (hist_1 / (n1 * n1)) + (hist_2 / (n2 * n2))
    val = np.nansum(np.power(diff_1, 2) / sum_1)
    ddof = len(hist_1) - 1
    print('T = {}'.format(val))
    print('P(chi^2 > T) = {}'.format(chisqprob(val, ddof)))
def combine_fisher(self, pvalue1, pvalue2):
    """
    Combine two p-values using Fisher's method.

    See https://en.wikipedia.org/wiki/Fisher%27s_method for more details
    """
    if pvalue1 == 0.0 or pvalue2 == 0.0:
        return 0.0
    chi = -2.0 * (math.log(pvalue1) + math.log(pvalue2))
    p_out = chisqprob(chi, 4)  # 2k = 4 degrees of freedom for k = 2 p-values
    return p_out
def LRT(ll1, ll2, df):
    """
    Calculates likelihood ratio test between two models.

    :params ll1, ll2: log-likelihoods of the two models studied
    :param df: degrees of freedom of difference between the two models
    """
    LR = abs(2 * (ll1 - ll2))
    # stats.chisqprob was removed from scipy; chi2.sf is the equivalent
    stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
    p = stats.chisqprob(LR, df)
    return (LR, p)
def chisq_2sam(f_obs1, f_obs2):
    """
    Calculates a two-sample chi square test.

    The two-sample chi square test tests the null hypothesis that the two
    categorical data samples have the same frequencies.

    Parameters
    ----------
    f_obs1, f_obs2 : two arrays with observed frequencies in each category.
        The number of categories must be the same.

    Returns
    -------
    chisquare statistic : float
        The chisquare test statistic
    p : float
        The p-value of the test.

    Notes
    -----
    If the number of observations is the same across the two samples, then
    the number of degrees of freedom is equal to the number of bins minus
    one (due to the additional constraint on the sample size), else it is
    equal to the number of bins.  The same observations on the size of the
    sample in the one-way chi squared test (see scipy.stats.chisquare)
    apply also for the case with two samples.

    Examples
    --------
    >>> chisq_2sam(np.ones(10), np.ones(10))  # same frequencies
    (0.0, 1.0)
    >>> chi2, pval = chisq_2sam([100, 0, 0], [0, 0, 100])
    >>> print(chi2)
    200.0
    >>> print(pval)
    2.08848758376e-45
    """
    if len(f_obs1) != len(f_obs2):
        raise ValueError('expecting same number of bins')
    f_obs1, f_obs2 = np.asarray(f_obs1, dtype=int), np.asarray(f_obs2, dtype=int)
    s1, s2 = np.sum(f_obs1), np.sum(f_obs2)
    if s1 == s2:
        ksntrns = 1
    else:
        ksntrns = 0
    idx = (f_obs1 + f_obs2) == 0.
    ksntrns += np.sum(idx.astype(int))
    ddof = len(f_obs1) - ksntrns
    ratio1, ratio2 = map(np.sqrt, [s2 / s1, s1 / s2])
    chisq = ((f_obs1 * ratio1) - (f_obs2 * ratio2)) ** 2 / (f_obs1 + f_obs2)
    chisq = np.sum(chisq[~idx])
    return chisq, chisqprob(chisq, ddof)
def Fisher_combination_Pvals(pvalues_array):
    pvalues_array = np.array(pvalues_array)
    z = 0
    for pval in pvalues_array:
        if pval > 1.e-20:
            z += -2 * np.log(pval)
        else:
            z += -2 * np.log(1.e-20)
    k = len(pvalues_array)
    combined_Pval = chisqprob(z, 2 * k)
    return combined_Pval