def test_mean(self, mu0, return_weights=False):
    """
    Returns -2 x log-likelihood ratio, p-value and weights
    for a hypothesis test of the mean.

    Parameters
    ----------
    mu0 : float
        Mean value to be tested
    return_weights : bool
        If return_weights is True the function returns the weights of the
        observations under the null hypothesis. Default is False

    Returns
    -------
    test_results : tuple
        The log-likelihood ratio and p-value of mu0
    """
    self.mu0 = mu0
    endog = self.endog
    nobs = self.nobs
    eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog))
    eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog))
    eta_star = optimize.brentq(self._find_eta, eta_min, eta_max)
    new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0))
    llr = -2 * np.sum(np.log(nobs * new_weights))
    if return_weights:
        return llr, chi2.sf(llr, 1), new_weights
    else:
        return llr, chi2.sf(llr, 1)

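# Hedged usage sketch: test_mean reads like the statsmodels emplike DescStat
# method (the test_var docstring further down references sm.emplike.DescStat),
# so a call would look like this; the data and the DescStat entry point are
# assumptions.
import numpy as np
import statsmodels.api as sm

el = sm.emplike.DescStat(np.random.standard_normal(100))
llr, pval = el.test_mean(0.)  # H0: the population mean is 0
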
def do_TS_GMM(self, utu, utr, b, f):
    # Unrestricted case where intercepts are included
    alpha = b[0:self.N]
    gtu = self.set_g(utu, f)
    nmom = gtu.shape[0]
    d = self.set_d()
    m = int(ceil(1.2 * float(self.T) ** (1.0 / 3)))  # int(floor(self.T**(1.0/4.0)))
    Su = self.set_S(m, gtu)
    SIGMAb = self.get_varb(d, Su)
    Sigmaalp = SIGMAb[0:self.N, 0:self.N]
    if rank(Sigmaalp, tol=1e-9) < self.N:
        valu = dot(alpha.T, dot(pinv(Sigmaalp), alpha))
    else:
        valu = dot(alpha.T, solve(Sigmaalp, alpha))
    self.TS_GMM_pval_u = squeeze(chi2.sf(valu, self.N - self.k))
    # Restricted case with no intercept included
    gtr = self.set_g(utr, f)
    Sr = self.set_S(m, gtr)
    gTr = reshape(mean(gtr, axis=1), (nmom, 1))
    if rank(Sr, tol=1e-9) < nmom:
        valr = self.T * dot(gTr.T, dot(pinv(Sr), gTr))
    else:
        valr = self.T * dot(gTr.T, solve(Sr, gTr))
    self.TS_GMM_pval_r = squeeze(chi2.sf(valr, self.N - self.k))
    # GJ test
    gTu = reshape(mean(gtu, axis=1), (nmom, 1))
    val = self.T * (dot(gTr.T, solve(Su, gTr)) - dot(gTu.T, solve(Su, gTu)))
    self.TS_GMM_pval_3 = squeeze(chi2.sf(val, self.N - self.k))
    return

def calChi2(self, x, y):
    """
    Input
        x: feature values, 1D array
        y: actual labels, 1D array
    Returns
        chi2Value, pValue, dfreedom, eptFre
    """
    n = y.shape[0]
    xValues = np.unique(x)
    yValues = np.unique(y)
    # Distribution of y
    PyValues = [sum(y == yvalue) / n for yvalue in yValues]
    # Build the contingency tables: observed and expected frequencies
    realFre = np.zeros((len(xValues), len(yValues)))
    eptFre = np.copy(realFre)
    for xIdx, xvalue in enumerate(xValues):
        for yIdx, yvalue in enumerate(yValues):
            realFre[xIdx, yIdx] = sum((x == xvalue) & (y == yvalue))
            eptFre[xIdx, yIdx] = sum(x == xvalue) * PyValues[yIdx]
    # Compute the chi-square matrix, chi-square value, degrees of freedom and p-value
    chi2Matrix = np.power((realFre - eptFre), 2) / (eptFre + 1.0e-6)
    chi2Value = chi2Matrix.sum()
    dfreedom = (len(xValues) - 1) * (len(yValues) - 1)
    if dfreedom == 0:
        pValue = chi2.sf(chi2Value, dfreedom + 1)
    else:
        pValue = chi2.sf(chi2Value, dfreedom)
    return round(chi2Value, 4), round(pValue, 4), dfreedom, eptFre

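# Cross-check sketch for calChi2: scipy's chi2_contingency computes the same
# Pearson statistic directly from the observed table (correction=False turns
# off Yates' continuity correction); the toy arrays are assumptions.
import numpy as np
from scipy.stats import chi2_contingency

x = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y = np.array([0, 1, 1, 1, 0, 0, 1, 0])
obs = np.array([[np.sum((x == i) & (y == j)) for j in (0, 1)] for i in (0, 1)])
stat, p, dof, expected = chi2_contingency(obs, correction=False)
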
def p_value_calculate(X, y, is_intercept, X_null=None):
    print("X.shape={}, y.shape={}".format(X.shape, y.shape))
    lr_model = LogisticRegression(C=1e8, solver='lbfgs', max_iter=1000)
    lr_model.fit(X, y)
    alt_prob = lr_model.predict_proba(X)
    alt_log_likelihood = -log_loss(y, alt_prob, normalize=False)
    if is_intercept:
        # p-value of beta_0: the null model predicts the base rate everywhere
        null_prob = sum(y) / float(y.shape[0]) * np.ones(y.shape)
        null_log_likelihood = -log_loss(y, null_prob, normalize=False)
        df = 1
    else:
        # Compare against the nested model fitted on the null variables X_null
        lr_model.fit(X_null, y)
        null_prob = lr_model.predict_proba(X_null)[:, 1]
        null_log_likelihood = -log_loss(y, null_prob, normalize=False)
        df = X.shape[1] - X_null.shape[1]
    G = 2 * (alt_log_likelihood - null_log_likelihood)
    p_value = chi2.sf(G, df)
    return p_value

def g_func(init_par, alpha, delta, plx_obs, mualpha_obs, mudelta_obs,
           vrad_obs, sigma_obs, sigma_vrad, ccoef, N):
    """
    Estimate g_func (the exponent of the likelihood), which gives an
    estimate of the membership of each star to the moving group.

    Parameters
    ----------
    init_par - Set of initial values for:
               1) all the parallaxes [mas];
               2) the cluster centroid velocity [vx_0, vy_0, vz_0] [km/s];
               3) the cluster velocity dispersion, sigma_v [km/s].
    alpha, delta - Cluster member positions [rad].
    plx_obs, mualpha_obs, mudelta_obs - Observed values for parallaxes and
               proper motions [mas, mas/yr].
    sigma_obs - Observed errors for parallaxes and proper motions
               [mas, mas/yr].
    ccoef - 3-dim array of correlation coefficients from the HIP catalogue.
    N - The number of stars.

    Returns
    -------
    g_func - An array with the values of g_i(theta) for each star in the
             group; see eq. 19 in Lindegren+2000.
    """
    L, g = ilike(init_par, alpha, delta, plx_obs, mualpha_obs, mudelta_obs,
                 vrad_obs, sigma_obs, sigma_vrad, ccoef, N)
    p = np.zeros(N)
    for i in range(N):
        if np.isfinite(vrad_obs[i]):
            p[i] = chi2.sf(g[i], 3)
        else:
            p[i] = chi2.sf(g[i], 2)
    return p

def asymptotic_p_value(asimov_q, use_median_rather_than_asimov=False):
    if use_median_rather_than_asimov:
        median_q = ncx2.ppf(0.5, df=2, nc=max(0., asimov_q))
        p_value = chi2.sf(median_q, df=2)
    else:
        p_value = chi2.sf(asimov_q, df=2)
    return p_value

def statistics(n, N, scale=1.1):
    m = [2, 4, 8, 16, 32, 64]
    chisq_arr = np.zeros(len(m))
    fig6, ax6 = plt.subplots(figsize=(7, 5))
    for i in range(len(m)):
        k, avg_kprob, std_kprob = average(n, N, m[i], scale=scale)
        prob_theory = (k - m[i]) * np.log10(m[i]) - (k - m[i] + 1) * np.log10(m[i] + 1)
        prob_theory = 10**prob_theory
        chisq1 = chisqg(avg_kprob[:], prob_theory[:], sd=std_kprob[:])
        chisq2 = chisqg(avg_kprob[1:-1], prob_theory[1:-1], sd=std_kprob[1:-1])
        p_value1 = chi2.sf(chisq1, len(avg_kprob[:]) - 2)
        p_value2 = chi2.sf(chisq2, len(avg_kprob[1:-1]) - 2)
        chisq_arr[i] = chisq2 / (len(avg_kprob[1:-1]) - 2)
        print('m = ', m[i], 'all points, chisq = ', chisq1, 'p value = ', p_value1)
        print('m = ', m[i], 'sliced points, chisq = ', chisq2, 'p value = ', p_value2)
    ax6.plot(m, chisq_arr, 'o')
    ax6.set_xlabel('m')
    ax6.set_ylabel(r'$\chi^2/N_{dof}$')
    fig6.tight_layout()

def get_gwas(simulated_data, freq_a1, freq_b1):
    model_a = smf.ols("phenotype ~ snp_a_gen", data=simulated_data).fit()
    model_b = smf.ols("phenotype ~ snp_b_gen", data=simulated_data).fit()
    model = smf.ols("phenotype ~ snp_a_gen + snp_b_gen", data=simulated_data).fit()
    # print(model.summary())
    gwas_dict = {
        "snp_num": [1, 2],
        "freq1": [freq_a1, freq_b1],
        "freq2": [1 - freq_a1, 1 - freq_b1],
        "beta": [model_a.params.snp_a_gen, model_b.params.snp_b_gen],
        "se": [model_a.bse.snp_a_gen, model_b.bse.snp_b_gen],
        "p": [
            chi2.sf((model_a.params.snp_a_gen / model_a.bse.snp_a_gen)**2, 1),
            chi2.sf((model_b.params.snp_b_gen / model_b.bse.snp_b_gen)**2, 1),
        ],
    }
    gwas = pd.DataFrame.from_dict(gwas_dict)
    gwas = gwas[["snp_num", "freq1", "freq2", "beta", "se", "p"]]
    gwas["z_u"] = gwas["beta"] / gwas["se"]
    return gwas

def TS(pathway, ppi, stu, purb, dgv=0.4):
    """
    T-square. For the given pathway, this function creates the corresponding
    interaction matrix and returns the associated T^2, p-value, and other
    information.
    """
    # - pathway is a pandas DataFrame containing the id of a pathway, its
    #   included proteins, and their abundance ratios.
    # - z contains only the abundance ratios of the proteins, used later to
    #   calculate the T^2 value.
    # - m contains the indexes of the pathway's proteins that can be
    #   translated from Uniprot to STRING.
    # - S is the interaction matrix to be built from STRING interaction scores.
    pathway = pathway.sort_values(by='prot_id').reset_index(drop=True)
    z = np.vectorize(float)(pathway['exp'])
    m = np.where(np.isin(stu, pathway))[0]
    S = dgv * np.identity(len(z))
    nrow, ncol = S.shape
    if nrow != 1:
        # Each possible pair of proteins in the pathway will be looked at.
        for i in range(1, nrow):
            for j in range(i):
                # x1 and x2 are the Uniprot accessions of proteins i and j.
                x1 = pathway.iat[i, 1]
                x2 = pathway.iat[j, 1]
                # s1 and s2 are the corresponding STRING ids, translated using m.
                s1 = stu.iloc[m]['String_id'].to_numpy()[np.where(
                    np.isin(stu.iloc[m]['Uniprot_id'], x1))[0]]
                s2 = stu.iloc[m]['String_id'].to_numpy()[np.where(
                    np.isin(stu.iloc[m]['Uniprot_id'], x2))[0]]
                if len(s1) * len(s2) != 0:
                    # If there is one, p will contain the experimental value of
                    # interaction between the two proteins; if there are more,
                    # the mean will be used.
                    # Get all protein 1 partners in STRING.
                    p = ppi.iloc[np.where(np.isin(ppi['protein1'], s1))[0]]
                    # Check for protein 2 among the partners; p then holds
                    # interaction score(s) from experimental evidence only.
                    p = p.iloc[np.where(np.isin(p['protein2'], s2))]['experimental']
                    if len(p) > 0:
                        # Modify S to include that value at the corresponding index.
                        if z[pathway['prot_id'] == x1] * z[pathway['prot_id'] == x2] < 0:
                            S[i, j] = -np.mean(p)
                            S[j, i] = -np.mean(p)
                        else:
                            S[i, j] = np.mean(p)
                            S[j, i] = np.mean(p)
    # Transform STRING scores: make the S matrix positive-definite, and
    # therefore usable for the T^2 method.
    S = nearestPD(S)
    r = np.linalg.matrix_rank(S, tol=1e-10)
    # T2 score matrix effectively used
    T2 = TV(z, S)
    I = dgv * np.identity(len(z))
    T2I = TV(z, I)
    return np.array([
        pathway.iat[0, 0], ','.join(pathway['prot_id']), len(pathway), r,
        T2, chi2.sf(T2, r), T2I, chi2.sf(T2I, r)
    ], dtype=object)

def fmb_pval(self, alpham, vcvalpha):
    # Fall back to the pseudo-inverse when the covariance is rank deficient
    if rank(vcvalpha, tol=1e-9) < self.N:
        self.FMB_JS = dot(alpham.T, dot(pinv(vcvalpha), alpham))
    else:
        self.FMB_JS = dot(alpham.T, solve(vcvalpha, alpham))
    return chi2.sf(self.FMB_JS, self.N - self.k)

def LR_test(full, reduced):
    full_ll = list(full.rx2('loglik'))
    reduced_ll = list(reduced.rx2('loglik'))
    assert full_ll[0] == reduced_ll[0]
    if len(full_ll) == 1:
        return 1.
    full_df = len(full.rx2('coefficients'))
    if len(reduced_ll) == 1:
        return chi2.sf(2 * full_ll[1] - 2 * full_ll[0], full_df)
    reduced_df = len(reduced.rx2('coefficients'))
    df = max(full_df - reduced_df, 1)
    return chi2.sf(2 * full_ll[1] - 2 * reduced_ll[1], df)

def fishers_method(log10pvals):
    if len(log10pvals) == 1:
        return log10pvals[0]
    signs = set(np.sign(log10pvals))
    df = 2 * len(log10pvals)
    if len(signs) > 1:
        return 0
    elif signs == {-1}:
        # Fisher's statistic is -2 * sum(ln p) = 2 * ln(10) * sum(-log10 p)
        chi2sum = 2 * sum(-log10pvals) * np.log(10)
        return np.log10(chi2.sf(chi2sum, df))
    elif signs == {1}:
        chi2sum = 2 * sum(log10pvals) * np.log(10)
        return -np.log10(chi2.sf(chi2sum, df))

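# Worked example (inputs are assumptions): three p-values of 1e-2, 10^-3.5
# and 10^-1.2, all carried with negative sign, combine to roughly -4.6 on
# the log10 scale, i.e. a combined p-value of about 2.7e-5.
print(fishers_method(np.array([-2.0, -3.5, -1.2])))
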
def independence_test(self, A):
    length = len(A)
    if length == 1:
        return [0.0, None]
    n = 0
    ni = []
    nj = []
    # Row totals
    for i in range(length):
        row_sum = 0
        for ele in A[i]:
            n = n + ele
            row_sum = row_sum + ele
        ni.append(row_sum)
    # Column totals
    for i in range(len(A[0])):
        col_sum = 0
        for j in range(length):
            col_sum = col_sum + A[j][i]
        nj.append(col_sum)
    # Expected counts under independence
    T = []
    for i in ni:
        tmp = []
        for j in nj:
            tmp.append(float(i * j) / n)
        T.append(tmp)
    # Pearson chi-square statistic
    c2 = 0
    for i in range(length):
        for j in range(len(A[0])):
            c2 = c2 + (A[i][j] - T[i][j])**2 / T[i][j]
    p = C.sf(c2, (length - 1) * (len(A[0]) - 1))
    return [round(c2, 6), round(p, 6)]

def getStats(self, keyData, keyTheory, auto=False, show=False, numbins=-1):
    dataVector = self.datas[keyData]['binned'][:numbins]
    dofs = len(dataVector) - 2
    if auto:
        dofs -= 1
    chisqNull = self.chisq(keyData)
    chisqTheory = self.chisq(keyData, keyTheory, numbins=numbins)
    stats = {}
    stats['reduced chisquare'] = chisqTheory / dofs
    stats['pte'] = chi2.sf(chisqTheory, dofs)
    stats['null sig'] = np.sqrt(chisqNull)
    stats['theory sig'] = np.sqrt(chisqNull - chisqTheory)
    if show:
        printC("=" * len(keyTheory), color='y')
        printC(keyTheory, color='y')
        printC('-' * len(keyTheory), color='y')
        printC("amplitude", color='b')
        bf, err = self.datas[keyTheory]['amp']
        printC('{0:.2f}'.format(bf) + "+-" + '{:04.2f}'.format(err), color='p')
        for key, val in stats.items():
            printC(key, color='b')
            printC('{0:.2f}'.format(val), color='p')
        printC("=" * len(keyTheory), color='y')
    return stats

def chisq(obs, exp, cov_input, ddof=None, sidx=None, eidx=None):
    '''
    Compute chi-square.

    Input
        obs       : observation
        exp       : expected value
        cov_input : covariance
        ddof      : degrees of freedom
    Output
        chisq : computed chi-square
        p     : p-value
    '''
    from scipy.stats import chi2
    diff = obs - exp if not (exp == 0.).all() else obs.copy()
    cov = cov_input.copy()
    if sidx is None:
        sidx = 0
    if eidx is None:
        eidx = len(diff)
    diff = diff[sidx:eidx]
    cov = cov[sidx:eidx, sidx:eidx]
    # Rescale for numerical conditioning; the chi-square value is unchanged
    norm = np.mean(np.abs(cov))
    cov /= norm
    diff /= np.sqrt(norm)
    chisq = np.dot(np.linalg.pinv(cov), diff)
    chisq = np.dot(diff.T, chisq)
    if ddof is None:
        ddof = len(obs)
    p = chi2.sf(chisq, ddof)
    return chisq, p

def get_chi2_two(key, targ, null, plot=True):
    """ """
    ntype = targ.shape[0]
    for i in range(ntype):
        tt = np.array([
            sum([k * p for k, p in enumerate(targ[i, j, :])])
            for j in range(ntype)
        ])
        nn = np.array([
            sum([k * p for k, p in enumerate(null[i, j, :])])
            for j in range(ntype)
        ])
        n_nn = np.sum(nn)
        n_tt = np.sum(tt)
        k1 = np.sqrt(n_nn / n_tt)
        k2 = 1 / k1
        chi2_stat = sum([(k1 * t - k2 * n)**2 / (t + n) for t, n in zip(tt, nn)])
        df = len(tt) - 1
        print(i, tt, nn / np.sum(nn) * np.sum(tt),
              np.array(tt) - np.array(nn) / np.sum(nn) * np.sum(tt))
        # print(key, i, 'chi2', chi2_stat, df)
        p_value = chi2.sf(chi2_stat, df)
        print(key, i, 'p-value', p_value)

def chstwo(bins1, bins2, ddof=0, axis=0):
    """
    Chi-square test for difference between two data sets.
    Return the statistic and the p-value.

    Uses _count() to drop from the chi-square sum any entries for which both
    values are 0; the degrees of freedom are decremented for each dropped case.

    Comments on relation to NRC's chstwo() and SciPy's chi2.sf():
    - bins1 :: f_obs
    - bins2 :: f_exp
    - ddof  :: adjustment to dof
      ... related to knstrn ... dof = num_obs - 1 - ddof
      ... see NRC discussion of arguments to the chstwo algorithm.
      ... if data sets are of equal integral (perhaps normalized) then
          knstrn = 1, ddof = 0, dof = num_obs - 1 - 0
      ... if data sets are not of equal integral then knstrn = 0,
          ddof = -1, dof = num_obs - 1 - (-1) = num_obs
      ... could essentially rewrite dof = num_obs - 1 - ddof as
          dof = num_obs - knstrn - ddof, where ddof becomes any adjustment
          to dof beyond that of knstrn
    - Evaluating the prob from the chi2 distribution:
      ... NRC defines gammq as 1 - P, where P is the probability that the
          observed chi2 for a correct model should be less than a value
          chi2: gammq(0.5*df, 0.5*chi2)
      ... For scipy, we'd have gammq = 1 - scipy.stats.chi2.cdf(x, df) or
          gammq = scipy.stats.chi2.sf(x, df)
      ... sf is the survival function (also defined as 1 - cdf, but sf is
          sometimes more accurate)
    """
    # Check inputs
    if len(bins1) != len(bins2):
        return 'Error: chstwo: len(bins1) != len(bins2)'
    # Where bins1[i] = bins2[i] = 0, mask entry i
    bins1, bins2 = np.ma.masked_where(
        condition=[(bins1 == bins2) & (bins1 == 0),
                   (bins2 == bins1) & (bins2 == 0)],
        a=[bins1, bins2])
    # Do the test. Terms with division by zero have been masked out;
    # terms evaluating to zero are kept.
    terms = (bins1 - bins2)**2 / (bins1 + bins2)
    stat = terms.sum(axis=axis)
    num_obs = _count(terms, axis=axis)  # number of non-masked terms in stat
    ddof = np.asarray(ddof)
    p = chi2.sf(stat, num_obs - 1 - ddof)
    # print('chi2.sf(stat = %f, dof = %d, ddof = %d)' % (stat, num_obs - 1 - ddof, ddof))
    return stat, p

def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False):
    """
    Returns -2 x log-likelihood and the p-value for the joint
    hypothesis test for skewness and kurtosis.

    Parameters
    ----------
    skew0 : float
        Skewness value to be tested
    kurt0 : float
        Kurtosis value to be tested
    return_weights : bool
        If True, function also returns the weights that
        maximize the likelihood ratio. Default is False.

    Returns
    -------
    test_results : tuple
        The log-likelihood ratio and p-value of the joint hypothesis test.
    """
    self.skew0 = skew0
    self.kurt0 = kurt0
    start_nuisance = np.array([self.endog.mean(), self.endog.var()])
    llr = optimize.fmin_powell(self._opt_skew_kurt, start_nuisance,
                               full_output=1, disp=0)[1]
    p_val = chi2.sf(llr, 2)
    if return_weights:
        return llr, p_val, self.new_weights.T
    return llr, p_val

def testConnectednessBetweenTwoUsers(self, currentUser, neighborUser):
    """
    Cluster identification: test whether two user models have the same
    ground-truth theta.
    :param currentUser:
    :param neighborUser:
    :return:
    """
    n = currentUser.update_num
    m = neighborUser.update_num
    if n == 0 and m == 0:
        return False
    # Compute the numerator
    theta_combine = np.dot(
        np.linalg.pinv(currentUser.A + neighborUser.A -
                       2 * self.lambda_ * np.identity(n=self.dimension)),
        currentUser.b + neighborUser.b)
    num = np.linalg.norm(np.dot(currentUser.X,
                                currentUser.UserThetaNoReg - theta_combine))**2 \
        + np.linalg.norm(np.dot(neighborUser.X,
                                neighborUser.UserThetaNoReg - theta_combine))**2
    XCombinedRank = np.linalg.matrix_rank(
        np.concatenate((currentUser.X, neighborUser.X), axis=0))
    df1 = int(currentUser.rank + neighborUser.rank - XCombinedRank)
    chiSquareStatistic = num / (self.NoiseScale**2)
    p_value = chi2.sf(x=chiSquareStatistic, df=df1)
    if p_value <= self.neighbor_identification_alpha:
        # Upper bound on the probability of false alarm
        return False
    else:
        return True

def test_var(self, sig2_0, return_weights=False):
    """
    Returns -2 x log-likelihood ratio and the p-value for the
    hypothesized variance.

    Parameters
    ----------
    sig2_0 : float
        Hypothesized variance to be tested
    return_weights : bool
        If True, returns the weights that maximize the
        likelihood of observing sig2_0. Default is False

    Returns
    -------
    test_results : tuple
        The log-likelihood ratio and the p_value of sig2_0

    Examples
    --------
    >>> random_numbers = np.random.standard_normal(1000)*100
    >>> el_analysis = sm.emplike.DescStat(random_numbers)
    >>> hyp_test = el_analysis.test_var(9500)
    """
    self.sig2_0 = sig2_0
    mu_max = max(self.endog)
    mu_min = min(self.endog)
    llr = optimize.fminbound(self._opt_var, mu_min, mu_max,
                             full_output=1)[1]
    p_val = chi2.sf(llr, 1)
    if return_weights:
        return llr, p_val, self.new_weights.T
    else:
        return llr, p_val

def __init__(self, sigLocal, sig0, N0):
    # Convert significance to p-value
    pLocal = norm.sf(sigLocal)
    p0 = norm.sf(sig0)
    # Get the test-statistic value corresponding to the p-value
    u = chi2.isf(pLocal * 2, 1)
    u0 = chi2.isf(p0 * 2, 1)
    # The main equations
    N = N0 * exp(-(u - u0) / 2.)
    pGlobal = N + chi2.sf(u, 1) / 2.
    # Further info
    sigGlobal = norm.isf(pGlobal)
    trialFactor = pGlobal / pLocal
    self.sigGlobal = sigGlobal
    self.sigLocal = sigLocal
    self.sig0 = sig0
    self.pGlobal = pGlobal
    self.pLocal = pLocal
    self.p0 = p0
    self.N0 = N0
    self.N = N
    self.u0 = u0
    self.u = u
    self.trialFactor = trialFactor

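# Usage sketch (all numbers are assumptions): for a 4 sigma local
# significance, a reference level sig0 = 1 sigma and N0 = 8 up-crossings at
# that level, the look-elsewhere-corrected significance and trial factor
# follow directly. The class name is hypothetical; only its __init__ is
# shown above.
lee = GlobalSignificance(4.0, 1.0, 8)
print(lee.sigGlobal, lee.trialFactor)
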
def lr_test(self, ll_min, ll_max, p_threshold, delta_params):
    """Performs likelihood ratio test.

    Parameters
    ----------
    ll_min : float
        Log likelihood of model with fewer params.
    ll_max : float
        Log likelihood of model with more params.
    p_threshold : float
        Threshold of p value to accept model_max as better.
    delta_params : int
        Difference in number of parameters in nested model.

    Returns
    -------
    bool
        True if test passes.
    """
    lr = 2 * (ll_max - ll_min)  # log-likelihood ratio statistic
    p = chi2.sf(lr, delta_params)
    print(ll_min, ll_max, delta_params)
    print("p-value is: " + str(p))
    return p < p_threshold

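# Worked call (numbers are assumptions): two extra parameters improving the
# log likelihood from -1204.6 to -1199.1 give lr = 11.0 and
# p = chi2.sf(11.0, 2) ~ 0.004, so the test passes at p_threshold = 0.05.
passed = model.lr_test(-1204.6, -1199.1, 0.05, 2)  # `model` is a hypothetical instance
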
def logrank(fr_sample1, fr_sample2):
    confidence = 0.95
    Z = norm.ppf((1.00 + confidence) / 2.0)
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)
    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)
    dN1 = fr1['dN'].reindex(idx, fill_value=0)
    dN2 = fr2['dN'].reindex(idx, fill_value=0)
    w1 = (Y1 * Y2 / (Y1 + Y2)).fillna(0)
    w2 = ((Y1 * Y2) / (Y1 + Y2)**2).fillna(0)
    U_score = (w1 * (h1 - h2)).sum()
    U_var = (w2 * (dN1 + dN2)).sum()
    score = U_score**2 / U_var
    p_value = chi2.sf(score, 1)
    # print(U_score, U_var, score, p_value)
    return p_value

def _opt_var(self, nuisance_mu, pval=False):
    """
    This is the function to be optimized over a nuisance mean parameter
    to determine the likelihood ratio for the variance.

    Parameters
    ----------
    nuisance_mu : float
        Value of a nuisance mean parameter

    Returns
    -------
    llr : float
        Log likelihood of a pre-specified variance holding the nuisance
        parameter constant
    """
    endog = self.endog
    nobs = self.nobs
    sig_data = ((endog - nuisance_mu) ** 2 - self.sig2_0)
    mu_data = (endog - nuisance_mu)
    est_vect = np.column_stack((mu_data, sig_data))
    eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs]),
                                  est_vect,
                                  np.ones(nobs) * (1. / nobs))
    denom = 1 + np.dot(eta_star, est_vect.T)
    self.new_weights = 1. / nobs * 1. / denom
    llr = np.sum(np.log(nobs * self.new_weights))
    if pval:  # Used for contour plotting
        return chi2.sf(-2 * llr, 1)
    return -2 * llr

def multinomial_chi2_test(self, config):
    '''Simple multinomial chi2 test - based on Gardner & Knopoff (1974)'''
    # Config should include a 'K' marker - divide the catalogue into
    # intervals of K length
    if config['K'] < 1:
        raise ValueError('K must be greater than or equal to 1')
    start_bin = range(self.start_year, self.end_year, config['K'])
    end_bin = range(self.start_year + config['K'] - 1, self.end_year,
                    config['K'])
    number_ints = len(end_bin)
    time_ints = np.column_stack([np.array(start_bin[:number_ints]),
                                 np.array(end_bin)])
    ncount = np.zeros(np.shape(time_ints)[0], dtype=int)
    for iloc, time_bin in enumerate(time_ints):
        ncount[iloc] = np.sum(np.logical_and(self.year >= time_bin[0],
                                             self.year < time_bin[1]))
    # ncount = ncount.astype(float)
    theoretical_rate = self.number_events / float(config['K'])
    c_value, expected_c = self._get_c_value(config['K'], theoretical_rate)
    observed_c = self._get_obs_c(c_value, ncount)
    # Pearson statistic: sum of (observed - expected)^2 / expected
    chi2m = np.sum(((observed_c.astype(float) -
                     expected_c.astype(float)) ** 2.) /
                   expected_c.astype(float))
    if not config['dof']:
        config['dof'] = float(c_value[-1] - 2)
    p_value = chi2.sf(chi2m, config['dof'])
    return p_value, chi2m, c_value[-1], config['dof']

def test_corr(self, corr0, return_weights=0):
    """
    Returns -2 x log-likelihood ratio and p-value for the correlation
    coefficient between 2 variables.

    Parameters
    ----------
    corr0 : float
        Hypothesized value to be tested
    return_weights : bool
        If true, returns the weights that maximize
        the log-likelihood at the hypothesized value
    """
    nobs = self.nobs
    endog = self.endog
    if endog.shape[1] != 2:
        raise Exception('Correlation matrix not yet implemented')
    nuis0 = np.array([endog[:, 0].mean(), endog[:, 0].var(),
                      endog[:, 1].mean(), endog[:, 1].var()])
    x0 = np.zeros(5)
    weights0 = np.array([1. / nobs] * int(nobs))
    args = (corr0, endog, nobs, x0, weights0)
    llr = optimize.fmin(self._opt_correl, nuis0, args=args,
                        full_output=1, disp=0)[1]
    p_val = chi2.sf(llr, 1)
    if return_weights:
        return llr, p_val, self.new_weights.T
    return llr, p_val

def mcfequal(fr_sample1, fr_sample2, confidence=0.95, robust=False):
    # TODO: drop Y = 0, compare multiple
    Z = norm.ppf((1.00 + confidence) / 2.0)
    fr1 = fr_agg(fr_sample1)
    fr2 = fr_agg(fr_sample2)
    fr1 = fr1.set_index('Time')
    fr2 = fr2.set_index('Time')
    idx = fr1.index.union(fr2.index)
    Y1 = fr1['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    Y2 = fr2['dY'].reindex(idx, fill_value=0).cumsum().shift(1).fillna(0)
    h1 = fr1['dE[N]'].reindex(idx, fill_value=0)
    h2 = fr2['dE[N]'].reindex(idx, fill_value=0)
    w = (Y1 * Y2 / (Y1 + Y2)).fillna(0)
    U_score = (w * (h1 - h2)).sum()
    if not robust:
        # (w**2 * (h1/Y1 + h2/Y2)).sum()
        U_var = var(fr1, w).iloc[-1] + var(fr2, w).iloc[-1]
    else:
        U_var = robust_var(fr_sample1, w).iloc[-1] + \
                robust_var(fr_sample2, w).iloc[-1]
    p_value = chi2.sf(U_score**2 / U_var, 1)
    # print(U_score, U_var, U_score**2 / U_var, p_value)
    return p_value

def visualize_pruning(w_norm, n_retained,
                      title='Initial model weights vs theoretical for pruning'):
    fig, ax1 = plt.subplots()
    ax1.set_title(title)
    ax1.hist(w_norm, density=True, bins=200, alpha=0.6, histtype='stepfilled',
             range=[0, n_retained * 5])
    ax1.axvline(x=n_retained, linewidth=1, color='r')
    ax1.set_ylabel('PDF', color='b')
    ax2 = ax1.twinx()
    ax2.set_ylabel('Survival Function', color='r')
    ax1.set_xlabel('w_norm')
    x = np.linspace(chi2.ppf(0.001, n_retained),
                    chi2.ppf(0.999, n_retained), 100)
    ax2.plot(x, chi2.sf(x, n_retained), 'g-', lw=1, alpha=0.6, label='chi2 sf')
    ax1.plot(x, chi2.pdf(x, n_retained), 'r-', lw=1, alpha=0.6, label='chi2 pdf')

def solve(self):
    url = 'http://112.124.1.3:8060/getData/101.json'
    data = urllib.request.urlopen(url).read()
    babyArr = json.loads(data)['data']
    baby = []
    num = []
    for i in babyArr:
        # Values in (5, 10] apparently record months; convert to weeks
        # (~4.33 weeks per month)
        if i[2] <= 10 and i[2] > 5:
            baby.append(i[2] * 4.33)
            num.append(i[5])
        if i[2] < 49 and i[2] > 25:
            baby.append(i[2])
            num.append(i[5])
    a1, a2, a3, n1, n2, n3 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    for i in range(len(baby)):
        if baby[i] <= 37:
            a1 += 1
            if num[i] == 1:
                n1 += 1
        elif baby[i] >= 41:
            a3 += 1
            if num[i] == 1:
                n3 += 1
        else:
            a2 += 1
            if num[i] == 1:
                n2 += 1
    a = a1 + a2 + a3
    p1 = a1 / a
    p2 = a2 / a
    p3 = a3 / a
    n = n1 + n2 + n3
    c = n1**2 / (n * p1) + n2**2 / (n * p2) + n3**2 / (n * p3) - n
    p = chi2.sf(c, 2)
    print([c, p])

def llr_pvalue(self):
    """
    p-value of the likelihood ratio chi-squared statistic;
    `-2*(llnull - llf)` with `df_model` degrees of freedom under
    H0: all coefficients excluding the constant are zero
    """
    return chi2.sf(self.llr, self.df_model)

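# Standalone equivalent (numbers are assumptions): the property above is the
# usual LR test of the fitted model against the intercept-only null.
from scipy.stats import chi2

llnull, llf, df_model = -520.3, -512.8, 3
llr = -2 * (llnull - llf)  # matches the docstring formula
print(chi2.sf(llr, df_model))
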
def normaltest(vdf: vDataFrame, column: str):
    """
    ---------------------------------------------------------------------------
    Test whether a sample differs from a normal distribution.

    Parameters
    ----------
    vdf: vDataFrame
        input vDataFrame.
    column: str
        Input vcolumn to test.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    Z1, Z2 = skewtest(vdf, column)["value"][0], kurtosistest(vdf, column)["value"][0]
    Z = Z1**2 + Z2**2
    pvalue = chi2.sf(Z, 2)
    result = tablesample({
        "index": ["Statistic", "p_value"],
        "value": [Z, pvalue],
    })
    return result

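# Cross-check sketch: the Z1**2 + Z2**2 ~ chi2(2) combination above is the
# D'Agostino-Pearson omnibus test, which scipy exposes directly; the toy
# sample is an assumption.
import numpy as np
from scipy.stats import normaltest as scipy_normaltest

k2, pval = scipy_normaltest(np.random.standard_normal(500))
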
def get_s2_two_old(key, targ, null, plot=True):
    """ """
    ntype = targ.shape[0]
    for i in range(ntype):
        tt = np.array([
            sum([k * p for k, p in enumerate(targ[i, j, :])])
            for j in range(ntype)
        ])
        nn = np.array([
            sum([k * p for k, p in enumerate(null[i, j, :])])
            for j in range(ntype)
        ])
        n_nn = np.sum(nn)
        n_tt = np.sum(tt)
        print(i, tt, nn / np.sum(nn) * np.sum(tt),
              np.array(tt) - np.array(nn) / np.sum(nn) * np.sum(tt))
        nn = nn / n_nn
        tt = tt / n_tt
        cb = sum([np.sqrt(t * n) for t, n in zip(tt, nn)])
        s2 = 4 * n_nn * n_tt / (n_nn + n_tt) * np.arccos(cb)**2
        df = len(tt) - 1
        p_value = chi2.sf(s2, df)
        print(key, i, 'p-value', p_value)

def logrank_k(*fr_samples):
    confidence = 0.95
    Z = norm.ppf((1.00 + confidence) / 2.0)
    k = len(fr_samples)
    cohorts = range(k)
    fr_aggs = [fr_agg(fr_cohort).set_index('Time') for fr_cohort in fr_samples]
    dY = pd.DataFrame(
        {idx: fr_agg['dY'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN = pd.DataFrame(
        {idx: fr_agg['dN'] for idx, fr_agg in zip(cohorts, fr_aggs)}).fillna(0)
    dN_ = dN.sum(axis=1)
    Y = dY.cumsum(axis=0).shift(1, axis=0).fillna(0)
    Y_ = Y.sum(axis=1)
    K = Y.all(axis=1).astype(int)
    Z = np.array([(K * (dN[cohort] - Y[cohort] * (dN_ / Y_))).sum()
                  for cohort in cohorts])
    V = np.array([[
        (K * Y[cohort_i] / Y_ *
         (int(cohort_i == cohort_j) - Y[cohort_j] / Y_) * dN_).sum()
        for cohort_j in cohorts
    ] for cohort_i in cohorts])
    Z = Z[:-1]
    V = V[:-1, :-1]
    score = np.dot(np.dot(Z, inv(V)), Z)
    p_value = chi2.sf(score, k - 1)
    # print(Z, V, score, p_value)
    return p_value

def check_ra_dec_uniform(ra, dec, nside=2, footprint=None):
    pixels = hp.ang2pix(nside, ra, dec, lonlat=True)
    npix = hp.nside2npix(nside) if footprint is None else footprint.size
    pixels, counts = np.unique(pixels, return_counts=True)
    assert pixels.size == npix
    mean = ra.size / npix
    assert chi2.sf(((counts - mean)**2.0 / mean).sum(), df=npix - 1) > 1e-5

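# Usage sketch (the sample is an assumption): points drawn uniformly on the
# sphere - uniform in ra and in sin(dec) - should pass the chi-square check;
# numpy and healpy are assumed imported as np and hp, as in the source module.
rng = np.random.default_rng(0)
ra = rng.uniform(0., 360., 100000)
dec = np.degrees(np.arcsin(rng.uniform(-1., 1., 100000)))
check_ra_dec_uniform(ra, dec)
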
def Question_2_Chi_Squared():
    '''
    Observed        A    nA
      Bitter       13    11  | 24
      NBitter       5     7  | 12
                   18    18  | 36

    Expected-0      A                     nA
      Bitter       24/36 * 18/36 * 36          | 24
      NBitter                                  | 12
                   18                    18    | 36

    Expected-1      A    nA
      Bitter       12        | 24
      NBitter                | 12
                   18    18  | 36

    Expected-2      A    nA
      Bitter       12    12  | 24
      NBitter       6     6  | 12
                   18    18  | 36
    '''
    chi2_sum = (13 - 12)**2 / 12 + (11 - 12)**2 / 12 + (5 - 6)**2 / 6 + (7 - 6)**2 / 6
    print(chi2.sf(chi2_sum, 1))  # chi2_sum == 0.5

def pValue(confidence_levels, statistics="chi2"):
    if statistics is None:
        print("no test statistics specified. choose between ks or chi2")
        exit()
    elif statistics == "ks":
        # The KS statistic is relevant for the actual confidence regions
        return kstest(confidence_levels, 'uniform')[1]
    elif statistics == "chi2":
        # The Fisher method chi2 statistic acts on the p-values,
        # defined as 1 - CL
        # Compute the Fisher combined statistic
        p = -2. * np.sum(np.log(confidence_levels))
        # Find out if this is chi-squared distributed with 2*NExp degrees
        # of freedom using the survival function.
        # print("sf: ", chi2.sf(p, 2. * len(confidence_levels)))
        sfValue = chi2.sf(p, 2. * len(confidence_levels))
        # print("the confidence levels are: \n", confidence_levels)
        # print("this gives a global pvalue of ", p)
        # print("the survival function for this is ", sfValue)
        # x = np.arange(0.0, 50, 0.01)
        # plt.plot(x, chi2.sf(x, 2 * len(confidence_levels)))
        # plt.axvline(p)
        # plt.show()
        return sfValue

def plot_ts_vs_chi2(data, ext_list="ext1_ts", ndf_chi2=[1],
                    subplot=[1, 2, 1], **kwargs):
    ax = plt.subplot(subplot[0], subplot[1], subplot[2])
    ext_data = column(data, "%s" % ext_list)
    clean_data = [x for x in ext_data if not math.isnan(x)]  # remove nan from data
    n, bins, patches = plt.hist(
        clean_data, int(math.ceil(max(column(data, "%s" % ext_list)))),
        density=True, facecolor="green")
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    chi2_vals = []
    colors = ["r", "b", "g"]
    for j in range(0, len(ndf_chi2)):
        chi2_vals.append(chi2.pdf(bincenters, ndf_chi2[j]))
        plt.plot(bincenters, chi2_vals[j], "%s--" % colors[j], linewidth=2.0,
                 label=r"$\chi^2_%i$/2" % ndf_chi2[j])
    legend = ax.legend(loc="upper right", frameon=False)
    plt.ylabel("PDF")
    plt.xlabel("TS$_{%s}$" % ext_list[0:4])
    plt.yscale("log")
    plt.ylim([0.00001, 2.0])
    ax = plt.subplot(subplot[0], subplot[1], subplot[2] + 1)
    n, bins, patches = plt.hist(
        clean_data, int(math.ceil(max(column(data, "%s" % ext_list)))),
        density=True, facecolor="green", cumulative=-1)
    chi2_sfvals = []
    for j in range(0, len(ndf_chi2)):
        chi2_sfvals.append(chi2.sf(bincenters, ndf_chi2[j]))
        plt.plot(bincenters, chi2_sfvals[j], "%s--" % colors[j], linewidth=2.0,
                 label=r"$\chi^2_%i$/2" % ndf_chi2[j])
    legend = ax.legend(loc="upper right", frameon=False)
    plt.ylabel("1-CDF")
    plt.xlabel("TS$_{%s}$" % ext_list[0:4])
    plt.yscale("log")
    plt.ylim([0.00001, 2.0])

def independence_test(self, A):
    if len(A) == 0:
        return [None, None]
    if len(A) == 1:
        return [0.0, None]
    rows = []
    columns = []
    # Row totals
    for row in A:
        r_sum = 0.0
        for r in row:
            r_sum += r
        rows.append(r_sum)
    # Column totals
    for i in range(0, len(A[0])):
        c_sum = 0.0
        for row in A:
            c_sum += row[i]
        columns.append(c_sum)
    tot = sum(rows)
    # Expected counts under independence (Ma_1)
    Ma_1 = []
    for i in range(0, len(rows)):
        row = []
        for j in range(0, len(columns)):
            T = rows[i] * columns[j] / tot
            row.append(T)
        Ma_1.append(row)
    # Cell contributions to the chi-square statistic (Ma_2)
    Ma_2 = []
    for i in range(0, len(rows)):
        row = []
        for j in range(0, len(columns)):
            Z = (A[i][j] - Ma_1[i][j])**2 / Ma_1[i][j]
            row.append(Z)
        Ma_2.append(row)
    # Chi-square statistic X_2
    X_2 = 0.0
    for x in Ma_2:
        for y in x:
            X_2 += y
    print(X_2)
    print(tot)
    # Contingency coefficient c
    c = (X_2 / (X_2 + tot))**0.5
    print(c)
    # p-value
    p = chi2.sf(x=X_2, df=(len(rows) - 1) * (len(columns) - 1))
    return [round(X_2, 6), round(p, 6)]

def var_threshold(self, alpha):
    SS = (self.n1 - 1) * self.S1
    chi20 = SS / self.var0
    n1 = self.n1
    # Hypothesis testing: two-sided (a), lower-tail (b), upper-tail (c)
    H1a = chi2.ppf(1 - alpha / 2.0, n1 - 1) < chi20 or \
          chi2.ppf(alpha / 2.0, n1 - 1) > chi20
    H1b = chi2.ppf(alpha / 2.0, n1 - 1) > chi20
    H1c = chi2.ppf(1 - alpha / 2.0, n1 - 1) < chi20
    # p-values
    p1a = np.max(np.array([chi2.sf(chi20, n1 - 1), 1 - chi2.sf(chi20, n1 - 1)]))
    p1b = chi2.sf(chi20, n1 - 1)
    p1c = 1 - chi2.sf(chi20, n1 - 1)
    # Confidence intervals: the minimum level of significance
    # alpha for which the null hypothesis is rejected
    c1 = (n1 - 1) * SS / chi2.ppf(1 - alpha / 2.0, n1 - 1)
    c2 = (n1 - 1) * SS / chi2.ppf(alpha / 2.0, n1 - 1)
    return H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)

def p_z_norm(est, se):
    '''Convert estimate and se to Z-score and P-value.'''
    try:
        Z = est / se
    except (FloatingPointError, ZeroDivisionError):
        Z = float('inf')
    P = chi2.sf(Z ** 2, 1, loc=0, scale=1)  # 0 if Z = inf
    return P, Z

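# Sanity check: chi2.sf(Z**2, 1) equals the two-sided normal p-value
# 2 * norm.sf(abs(Z)), which is why the Wald Z-score is squared above.
from scipy.stats import chi2, norm

z = 1.96
assert abs(chi2.sf(z ** 2, 1) - 2 * norm.sf(abs(z))) < 1e-12
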
def chi2_from_sample(data):
    histdata = np.histogram(data, 56, range=(-7, 7))
    N = sum(histdata[0])
    y_data = histdata[0]
    yerr2 = y_data * (1 - y_data / N)  # binomial variance per bin
    x = histdata[1][:-1] + np.diff(histdata[1]) / 2
    norm025 = norm(0, 2.5)
    y_fit = N * (norm025.cdf(histdata[1][1:]) - norm025.cdf(histdata[1][:-1]))
    chi2_m = sum((y_data - y_fit)**2 / yerr2)
    return chi2_m, chi2.sf(chi2_m, len(x)), len(x)

def sf_z2m(ts, m=2):
    """
    Return the survival function (chance probability) according to the
    asymptotic calibration for the Z^2_m test.

    args
    ----
    ts
        result of the Z^2_m test
    """
    from scipy.stats import chi2
    return chi2.sf(ts, 2 * m)

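# Example call (the statistic value is an assumption): a Z^2_2 test value of
# 30 has a chance probability of chi2.sf(30, 4), roughly 4.9e-6.
print(sf_z2m(30.0, m=2))
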
def hotel2(X1, X2):
    """
    Computes Hotelling's t-squared statistic under two assumptions on the
    variance.

    :param X1: pandas DataFrame with samples from first group
    :param X2: pandas DataFrame with samples from second group
    :return: None
    """
    # TODO: Verify Hotelling results
    n1, k = X1.shape
    n2, k2 = X2.shape
    assert k == k2
    ybar1 = X1.mean().to_numpy()
    s1 = np.cov(X1, rowvar=False)
    ybar2 = X2.mean(axis=0).to_numpy()
    s2 = np.cov(X2, rowvar=False)
    alpha = 0.05
    diffs = (ybar1 - ybar2).reshape(1, k)
    # TODO: Incorporate a test for equal variances
    # If variances are assumed equal, then pool
    if True:
        spool = ((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)
        t2 = diffs\
            .dot(np.linalg.inv(spool * (1.0 / n1 + 1.0 / n2)))\
            .dot(ybar1 - ybar2)\
            .item(0)
        eff = (n1 + n2 - k - 1) * t2 / (k * (n1 + n2 - 2))
        df1 = k
        df2 = n1 + n2 - k - 1
        p_value = f.sf(eff, df1, df2)
        print('If variances are assumed equal between classes')
        if p_value < alpha:
            print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
        else:
            print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
        print(t2, p_value)
    # If variances are not assumed equal, then use modified Hotelling
    if True:
        t2 = diffs\
            .dot(np.linalg.inv(s1 / n1 + s2 / n2))\
            .dot(ybar1 - ybar2)\
            .item(0)
        p_value = chi2.sf(t2, k)
        print('If variances are not assumed equal between classes')
        if p_value < alpha:
            print("\t=> Reject the null hypothesis that mean(X1) == mean(X2)")
        else:
            print("\t=> Accept null hypothesis that mean(X1) == mean(X2)")
        print(t2, p_value)

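# Usage sketch with simulated groups (the data are assumptions); numpy,
# pandas, scipy.stats.f and scipy.stats.chi2 are assumed imported as in the
# source module.
rng = np.random.default_rng(1)
X1 = pd.DataFrame(rng.normal(size=(30, 3)))
X2 = pd.DataFrame(rng.normal(loc=0.5, size=(25, 3)))
hotel2(X1, X2)
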
def pvalues(self):
    """Association p-value for candidate markers."""
    self.compute_statistics()
    lml_alts = self.alt_lmls()
    lml_null = self.null_lml()
    lrs = -2 * lml_null + 2 * asarray(lml_alts)
    from scipy.stats import chi2
    chi2 = chi2(df=1)
    return chi2.sf(lrs)

def chi_test(alleles, freq_table, est, n=1):
    # Combinations: https://docs.python.org/2/library/itertools.html
    # Calculate the observed and the expected counts, then the chi statistic
    allele_pool = ''.join(alleles)
    individuals = Counter(est)
    observed_table = pd.DataFrame()
    expected_table = pd.DataFrame()
    for genotype in combinations_with_replacement(allele_pool, 2):
        genotype_string = genotype[0] + genotype[1]
        # EXPECTED
        if genotype[0] != genotype[1]:
            # Heterozygotes: multiply their expected frequencies by 2
            exp_hetero_count = 2 * freq_table[genotype[0]].values * \
                freq_table[genotype[1]].values * n
            expected_table[genotype_string] = exp_hetero_count
        else:
            # Homozygotes: square their expected frequencies
            exp_homo_count = (freq_table[genotype[0]].values ** 2) * n
            expected_table[genotype_string] = exp_homo_count
        # OBSERVED
        if genotype[0] == genotype[1]:
            # Homozygote, so order doesn't matter
            obs_homo_count = individuals[int(genotype_string)]
            observed_table.loc[0, genotype_string] = obs_homo_count
        else:
            # Heterozygote, so order does matter. Extended slice syntax
            # [::-1] reverses the string, making synonymous genotypes the same
            obs_hetero_count = individuals[int(genotype_string)]
            obs_hetero_count += individuals[int(genotype_string[::-1])]
            observed_table.loc[0, genotype_string] = obs_hetero_count
    # Calculate chi-square
    chi_table = ((observed_table - expected_table) ** 2) / expected_table
    chi_sq_statistic = chi_table.sum(axis=1).values[0]
    df = len(expected_table.columns) - 2
    p_value = chi2.sf(chi_sq_statistic, df)
    print('\n Expected Numbers')
    print(expected_table)
    print('\n Observed Numbers')
    print(observed_table)
    print('\n CHI^2: {}'.format(chi_sq_statistic))
    print('df: {}, p: {}'.format(df, p_value))
    return chi_sq_statistic, df, p_value

def fit(self):
    if self.verbose:
        print('rho\tbeta1\tbeta2\tsigma2')
    self.em()
    self.residue1 = self.Y1 - self.Y1.mean() - self.X * self.beta1
    self.residue2 = self.Y2 - self.Y2.mean() - self.X * self.beta2
    self.sigma1 = np.sqrt(np.dot(self.residue1.T, self.residue1) / (self.n - 2))
    self.sigma2 = np.sqrt(np.dot(self.residue2.T, self.residue2) / (self.n - 2))
    self.LL = -self.n * np.log(4 * np.pi**2 * self.sigma2 * self.sigma1 *
                               (1 - self.rho**2)) \
        - 1 / 2 / (1 - self.rho**2) * (
            np.dot(self.residue1.T, self.residue1) / self.sigma1**2
            - 2 * self.rho * np.dot(self.residue1.T, self.residue2)
            / self.sigma1 / self.sigma2
            + np.dot(self.residue2.T, self.residue2) / self.sigma2**2)
    # Likelihood-ratio p-value, immediately overwritten below by the normal
    # approximation based on the standardized beta2
    self.pvalue = chi2.sf(2 * (self.LL - self.LL_control), 2)
    self.beta2_norm = float(self.beta2 / self.sigma2)
    self.pvalue = 1 - norm.cdf(self.beta2_norm * np.sqrt(self.n))
    if self.verbose:
        print(self.beta2_norm, self.pvalue)

def solve(self):
    page = urllib.request.urlopen('http://112.124.1.3:8060/getData/101.json')
    c = page.read()
    data = json.loads(c)["data"]
    week1 = 0.0
    week1_2 = 0.0
    week2 = 0.0
    week2_2 = 0.0
    week3 = 0.0
    week3_2 = 0.0
    for x in data:
        y = x[2]
        if y > 5 and y <= 10:
            y = y * 4.33
        if y > 25 and y <= 37:
            if x[5] == 1:
                week1 += 1
            else:
                week1_2 += 1
        elif y >= 38 and y <= 40:
            if x[5] == 1:
                week2 += 1
            else:
                week2_2 += 1
        elif y >= 41 and y < 49:
            if x[5] == 1:
                week3 += 1
            else:
                week3_2 += 1
    week = [week1, week2, week3]
    week_2 = [week1_2, week2_2, week3_2]
    sum_week = sum(week)
    sum_week_2 = sum(week_2)
    tot = sum_week + sum_week_2
    t11 = (week1 + week1_2) * sum_week / tot
    t12 = (week2 + week2_2) * sum_week / tot
    t13 = (week3 + week3_2) * sum_week / tot
    z1 = (week1 - t11)**2 / t11
    z2 = (week2 - t12)**2 / t12
    z3 = (week3 - t13)**2 / t13
    x_2 = z1 + z2 + z3
    p = chi2.sf(x=x_2, df=2)
    return (round(x_2, 6), p)

def el_test(self, b0_vals, param_nums, method='nm',
            stochastic_exog=1, return_weights=0):
    """
    Returns the llr and p-value for a hypothesized parameter value
    for a regression that goes through the origin.

    Parameters
    ----------
    b0_vals : 1darray
        The hypothesized value to be tested.
    param_nums : 1darray
        Which parameters to test. Note this uses python indexing, but
        the '0' parameter refers to the intercept term, which is assumed 0.
        Therefore, param_nums should be > 0.
    return_weights : bool
        If True, returns the weights that optimize the likelihood
        ratio at b0_vals. Default is False.
    method : str
        Can either be 'nm' for Nelder-Mead or 'powell' for Powell. The
        optimization method that optimizes over nuisance parameters.
        Default is 'nm'.
    stochastic_exog : bool
        When True, the exogenous variables are assumed to be stochastic.
        When the regressors are nonstochastic, moment conditions are
        placed on the exogenous variables. Confidence intervals for
        stochastic regressors are at least as large as those for
        non-stochastic regressors. Default is True.

    Returns
    -------
    res : tuple
        pvalue and likelihood ratio.
    """
    b0_vals = np.hstack((0, b0_vals))
    param_nums = np.hstack((0, param_nums))
    test_res = self.model.fit().el_test(b0_vals, param_nums, method=method,
                                        stochastic_exog=stochastic_exog,
                                        return_weights=return_weights)
    llr_test = test_res[0]
    llr_res = llr_test - self.llr
    pval = chi2.sf(llr_res, self.model.exog.shape[1] - 1)
    if return_weights:
        return llr_res, pval, test_res[2]
    else:
        return llr_res, pval

from math import isnan, log

from scipy.stats import chi2


def fishers_method(values):
    """
    Function to combine p-values using Fisher's method.

    Args:
        values: list of P-values for a gene

    Returns:
        combined P-value
    """
    values = [x for x in values if not isnan(x)]
    # Use Fisher's combined method to estimate the P value from multiple
    # P-values. The chi-square statistic is -2*sum(ln(P-values)).
    return chi2.sf(-2 * sum(map(log, values)), 2 * len(values))

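# Worked example (p-values are assumptions): three independent tests with
# p = 0.04, 0.10 and 0.03 combine to roughly p ~ 0.006.
print(fishers_method([0.04, 0.10, 0.03]))
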
def fishersMethod(x):
    """
    Function to combine p-values using Fisher's method.

    Args:
        x: list of P-values for a gene

    Returns:
        combined P-value
    """
    x = [val for val in x if not math.isnan(val)]
    if len(x) == 0:
        return numpy.nan
    return chi2.sf(-2 * sum(numpy.log(x)), 2 * len(x))

from warnings import warn

import numpy
from numpy import cov, mean, shape, transpose
from numpy.linalg import LinAlgError, solve
from scipy.stats import chi2


def mahalanobis_distance(difference, num_random_features):
    num_samples, _ = shape(difference)
    sigma = cov(transpose(difference))
    try:
        numpy.linalg.inv(sigma)
    except LinAlgError:
        warn('covariance matrix is singular. Pvalue returned is 1.1')
        return 1.1
    mu = mean(difference, 0)
    if num_random_features == 1:
        stat = float(num_samples * mu ** 2) / float(sigma)
    else:
        stat = num_samples * mu.dot(solve(sigma, transpose(mu)))
    return chi2.sf(stat, num_random_features)

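# Usage sketch (inputs are assumptions): `difference` holds per-sample
# differences of num_random_features mean-embedding features; under the null
# the statistic is asymptotically chi2 with that many degrees of freedom.
pval = mahalanobis_distance(numpy.random.randn(500, 5), 5)
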
def solve(self):
    html = self.getHtml('http://112.124.1.3:8050/getData/101')
    data = json.loads(html)["data"]
    T1 = []
    T2 = []
    for i in range(len(data)):
        a = data[i][2]
        if (a <= 5) | (a >= 49) | ((a > 10) & (a <= 25)):
            continue
        if (a <= 10) & (a > 5):
            a = 4.33 * a
        if data[i][5] == 1:
            T1.append(a)
        T2.append(a)
    n1 = len(T1)
    n2 = len(T2)
    a1 = 0
    b1 = 0
    d1 = 0
    for i in T1:
        if i < 38:
            a1 = a1 + 1
        elif i >= 41:
            d1 = d1 + 1
        else:
            b1 = b1 + 1
    a2 = 0
    b2 = 0
    d2 = 0
    for i in T2:
        if i < 38:
            a2 = a2 + 1
        elif i >= 41:
            d2 = d2 + 1
        else:
            b2 = b2 + 1
    t1 = float(a2) * n1 / n2
    t2 = float(b2) * n1 / n2
    c2 = (float(a1**2) / (n1 * float(a2) / n2)
          + float(b1**2) / (n1 * float(b2) / n2)
          + float(d1**2) / (n1 * float(d2) / n2) - n1)
    p = C.sf(c2, 2)
    return [c2, p]

def mahalanobis_distance(difference, num_random_features):
    num_samples, _ = np.shape(difference)
    sigma = np.cov(np.transpose(difference))
    mu = np.mean(difference, 0)
    if num_random_features == 1:
        stat = float(num_samples * mu ** 2) / float(sigma)
    else:
        try:
            linalg.inv(sigma)
        except LinAlgError:
            # Note: the message mentions 1.1, but 0 is what gets returned
            print('covariance matrix is singular. Pvalue returned is 1.1')
            warnings.warn('covariance matrix is singular. Pvalue returned is 1.1')
            return 0
        stat = num_samples * mu.dot(linalg.solve(sigma, np.transpose(mu)))
    return chi2.sf(stat, num_random_features)

def mv_test_mean(self, mu_array, return_weights=False):
    """
    Returns -2 x log likelihood and the p-value for a multivariate
    hypothesis test of the mean.

    Parameters
    ----------
    mu_array : 1d array
        Hypothesized values for the mean. Must have same number of
        elements as columns in endog.
    return_weights : bool
        If True, returns the weights that maximize the
        likelihood of mu_array. Default is False.

    Returns
    -------
    test_results : tuple
        The log-likelihood ratio and p-value for mu_array.
    """
    endog = self.endog
    nobs = self.nobs
    if len(mu_array) != endog.shape[1]:
        raise Exception("mu_array must have the same number of "
                        "elements as the columns of the data.")
    mu_array = mu_array.reshape(1, endog.shape[1])
    means = np.ones((endog.shape[0], endog.shape[1]))
    means = mu_array * means
    est_vect = endog - means
    start_vals = 1.0 / nobs * np.ones(endog.shape[1])
    eta_star = self._modif_newton(start_vals, est_vect,
                                  np.ones(nobs) * (1.0 / nobs))
    denom = 1 + np.dot(eta_star, est_vect.T)
    self.new_weights = 1 / nobs * 1 / denom
    llr = -2 * np.sum(np.log(nobs * self.new_weights))
    p_val = chi2.sf(llr, mu_array.shape[1])
    if return_weights:
        return llr, p_val, self.new_weights.T
    else:
        return llr, p_val

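# Hedged usage sketch, assuming this is the multivariate statsmodels emplike
# DescStat method; the data are an assumption.
import numpy as np
import statsmodels.api as sm

mv = sm.emplike.DescStat(np.random.standard_normal((100, 2)))
llr, pval = mv.mv_test_mean(np.array([0., 0.]))
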
def chi2_calc(flux, fluxerr):
    '''
    Chi2 with constant-flux model.

    flux: flux array
    fluxerr: flux error array

    return: chi^2 with constant flux (at the weighted mean) model
    '''
    we_fix = []
    for item in fluxerr:
        w_fix = 1 / item**2
        we_fix.append(w_fix)
    wei_fix = np.array(we_fix)
    dof_fix = len(flux) - 1
    wm_fix = np.average(flux, weights=wei_fix)
    un_fix = 1 / np.sqrt(np.array(we_fix).sum())
    residual_fix = errf(wm_fix, flux, fluxerr)
    chisquared_fix = residual_fix**2
    chi_tot_fix = (residual_fix**2).sum()
    null_hyp_fix = chi2.sf(chi_tot_fix, np.array(flux).shape[0] - 1)
    return (chi_tot_fix, dof_fix, wm_fix, un_fix, null_hyp_fix)
