def wilcoxon(x, y=None):
    """Calculates the Wilcoxon signed-rank test for the null hypothesis
    that two samples come from the same distribution.  A non-parametric
    T-test.  (The normal approximation used requires N > 20.)

    Returns: t-statistic, two-tailed p-value
    """
    if y is None:
        d = x
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError, 'Unequal N in wilcoxon.  Aborting.'
        d = x - y
    d = compress(not_equal(d, 0), d)  # Keep all non-zero differences
    count = len(d)
    if count < 10:
        print "Warning: sample size too small for normal approximation."
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0)*r)
    r_minus = sum((d < 0)*r)
    T = min(r_plus, r_minus)
    mn = count*(count+1.0)*0.25
    se = math.sqrt(count*(count+1)*(2*count+1.0)/24)
    if len(r) != len(unique(r)):  # handle ties in data
        replist, repnum = find_repeats(r)
        corr = 0.0
        for i in range(len(replist)):
            si = repnum[i]
            corr += 0.5*si*(si*si-1.0)
        V = se*se - corr
        se = sqrt((count*V - T*T)/(count-1.0))
    z = (T - mn)/se
    prob = 2*(1.0 - stats.zprob(abs(z)))
    return T, prob

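# A minimal usage sketch for wilcoxon, kept in comments so importing this
# module stays side-effect free.  The paired measurements are made-up
# illustration data, and the normal approximation is only trusted for N > 20:
#
#     >>> before = [125, 115, 130, 140, 140, 115, 140, 125, 140, 135, 120, 130]
#     >>> after  = [110, 122, 125, 120, 141, 124, 123, 137, 135, 145, 118, 128]
#     >>> T, p = wilcoxon(before, after)
#
# A small p argues against the paired differences being symmetric about zero.
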
def boxcox_llf(lmb, data):
    """The boxcox log-likelihood function, evaluated at lmb for the
    given data.
    """
    N = len(data)
    y = boxcox(data, lmb)
    my = stats.mean(y)
    f = (lmb-1)*sum(log(data))
    f -= N/2.0*log(sum((y-my)**2.0/N))
    return f

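# How boxcox_llf is typically used: scan candidate lambdas and keep the
# maximizer.  A comment-only sketch; it assumes positive data and that an
# argmax function is available from the array package this module imports:
#
#     >>> data = distributions.expon.rvs(size=100) + 0.5
#     >>> lmbs = arange(-2.0, 2.0, 0.01)
#     >>> llf = [boxcox_llf(lmb, data) for lmb in lmbs]
#     >>> best = lmbs[argmax(llf)]      # lambda maximizing the likelihood
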
def norm(x, ord=2):
    """ norm(x, ord=2) -> n

    Matrix and vector norm.

    Inputs:

      x -- a rank-1 (vector) or rank-2 (matrix) array
      ord -- the order of the norm.

    Comments:

      For vectors ord can be any real number including Inf or -Inf.
        ord = Inf, computes the maximum of the magnitudes
        ord = -Inf, computes the minimum of the magnitudes
        ord is finite, computes sum(abs(x)**ord)**(1.0/ord)

      For matrices ord can only be one of +/- 1, +/- 2, +/- Inf, or 'fro'.
        ord = 2 computes the largest singular value
        ord = -2 computes the smallest singular value
        ord = 1 computes the largest column sum of absolute values
        ord = -1 computes the smallest column sum of absolute values
        ord = Inf computes the largest row sum of absolute values
        ord = -Inf computes the smallest row sum of absolute values
        ord = 'fro' computes the Frobenius norm sqrt(sum(diag(X.H * X)))
    """
    x = asarray_chkfinite(x)
    nd = len(x.shape)
    Inf = scipy_base.Inf
    if nd == 1:
        if ord == Inf:
            return scipy_base.amax(abs(x))
        elif ord == -Inf:
            return scipy_base.amin(abs(x))
        else:
            return scipy_base.sum(abs(x)**ord)**(1.0/ord)
    elif nd == 2:
        if ord == 2:
            return scipy_base.amax(decomp.svd(x, compute_uv=0))
        elif ord == -2:
            return scipy_base.amin(decomp.svd(x, compute_uv=0))
        elif ord == 1:
            return scipy_base.amax(scipy_base.sum(abs(x)))
        elif ord == Inf:
            return scipy_base.amax(scipy_base.sum(abs(x), axis=1))
        elif ord == -1:
            return scipy_base.amin(scipy_base.sum(abs(x)))
        elif ord == -Inf:
            return scipy_base.amin(scipy_base.sum(abs(x), axis=1))
        elif ord in ['fro', 'f']:
            val = real((conjugate(x)*x).flat)
            return sqrt(add.reduce(val))
        else:
            raise ValueError, "Invalid norm order for matrices."
    else:
        raise ValueError, "Improper number of dimensions to norm."

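# Quick illustration of the vector and matrix cases of norm; the expected
# values are exact for these small inputs:
#
#     >>> norm([3.0, 4.0])                            # ord=2 -> 5.0
#     >>> norm([3.0, 4.0], ord=scipy_base.Inf)        # max magnitude -> 4.0
#     >>> norm([[1.0, 2.0], [3.0, 4.0]], ord='fro')   # sqrt(30)
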
def kstat(data, n=2):
    """Return the nth k-statistic (1<=n<=4 so far).

    The nth k-statistic is the unique symmetric unbiased estimator of the
    nth cumulant kappa_n.
    """
    if n > 4 or n < 1:
        raise ValueError, "k-statistics only supported for 1<=n<=4"
    n = int(n)
    S = zeros(n+1, 'd')
    data = ravel(data)
    N = len(data)
    for k in range(1, n+1):
        S[k] = sum(data**k)
    if n == 1:
        return S[1]*1.0/N
    elif n == 2:
        return (N*S[2] - S[1]**2.0)/(N*(N-1.0))
    elif n == 3:
        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N-1.0)*(N-2.0))
    elif n == 4:
        return (-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - \
                4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / \
               (N*(N-1.0)*(N-2.0)*(N-3.0))
    else:
        raise ValueError, "Should not be here."

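# Sanity check for kstat: the n=2 k-statistic is the unbiased sample
# variance, so it should agree with stats.var up to roundoff (assumes the
# distributions objects expose .rvs as elsewhere in scipy.stats):
#
#     >>> data = distributions.norm.rvs(size=50)
#     >>> kstat(data, 2)        # matches stats.var(data)
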
def mood(x, y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using a Mood test.

    Specifically, compute the z statistic and the probability of error that
    the null hypothesis is true but rejected with the computed statistic as
    the critical value.

    One can reject the null hypothesis that the ratio of scale parameters is
    1 if the returned probability of error is small (say < 0.05).
    """
    n = len(x)
    m = len(y)
    xy = r_[x, y]
    N = m + n
    if N < 3:
        raise ValueError, "Not enough observations."
    ranks = stats.rankdata(xy)
    Ri = ranks[:n]
    M = sum((Ri - (N+1.0)/2)**2)
    # Approximate statistic: mean and variance of M under the null
    mnM = n*(N*N - 1.0)/12
    varM = m*n*(N+1.0)*(N+2)*(N-2)/180
    z = (M - mnM)/sqrt(varM)
    p = distributions.norm.cdf(z)
    pval = 2*min(p, 1-p)
    return z, pval

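# Usage sketch for mood: two synthetic samples with equal medians but the
# second deliberately given twice the spread (illustration only):
#
#     >>> x = distributions.norm.rvs(size=40)
#     >>> y = 2*distributions.norm.rvs(size=40)
#     >>> z, p = mood(x, y)     # small p suggests unequal scales
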
def orth(A):
    """Return an orthonormal basis for the range of A using the SVD."""
    u, s, vh = svd(A)
    M, N = A.shape
    tol = max(M, N)*scipy_base.amax(s)*eps
    num = scipy_base.sum(s > tol)
    Q = u[:,:num]
    return Q

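# Sketch for orth: the columns of this matrix are linearly dependent
# (rank 1), so the returned basis has a single column:
#
#     >>> A = array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
#     >>> Q = orth(A)           # Q.shape == (3, 1)
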
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """ lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0) -> x,resids,rank,s

    Return least-squares solution of a * x = b.

    Inputs:

      a -- An M x N matrix.
      b -- An M x nrhs matrix or M vector.
      cond -- Used to determine effective rank of a.

    Outputs:

      x -- The solution (N x nrhs matrix) to the minimization problem:
                  2-norm(| b - a * x |) -> min
      resids -- The residual sum-of-squares for the solution matrix x
                (only if M>N and rank==N).
      rank -- The effective rank of a.
      s -- Singular values of a in decreasing order.  The condition number
           of a is abs(s[0]/s[-1]).
    """
    a1, b1 = map(asarray_chkfinite, (a, b))
    if len(a1.shape) != 2:
        raise ValueError, 'expected matrix'
    m, n = a1.shape
    if len(b1.shape) == 2:
        nrhs = b1.shape[1]
    else:
        nrhs = 1
    if m != b1.shape[0]:
        raise ValueError, 'incompatible dimensions'
    gelss, = get_lapack_funcs(('gelss',), (a1, b1))
    if n > m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n, nrhs), gelss.typecode)
        if len(b1.shape) == 2:
            b2[:m,:] = b1
        else:
            b2[:m,0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a, '__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b, '__array__'))
    if gelss.module_name[:7] == 'flapack':
        lwork = calc_lwork.gelss(gelss.prefix, m, n, nrhs)[1]
        v, x, s, rank, info = gelss(a1, b1, cond=cond, lwork=lwork,
                                    overwrite_a=overwrite_a,
                                    overwrite_b=overwrite_b)
    else:
        raise NotImplementedError, 'calling gelss from %s' % (gelss.module_name)
    if info > 0:
        raise LinAlgError, "SVD did not converge in Linear Least Squares"
    if info < 0:
        raise ValueError, \
              'illegal value in %d-th argument of internal gelss' % (-info)
    resids = asarray([], x.typecode())
    if n < m:
        x1 = x[:n]
        if rank == n:
            resids = sum(x[n:]**2)
        x = x1
    return x, resids, rank, s

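# Minimal least-squares sketch: fit a line y = c0 + c1*t by building the
# design matrix by hand.  The ones/transpose helpers are assumed to come
# from the array package this module already imports:
#
#     >>> t = arange(0.0, 10.0)
#     >>> y = 3.0 + 2.0*t
#     >>> A = transpose(array([ones(len(t)), t]))
#     >>> x, resids, rank, s = lstsq(A, y)    # x is approximately [3, 2]
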
def radon(arr, theta=None):
    """Compute the Radon transform (sinogram) of the 2-D array arr at the
    given projection angles theta, in degrees (default: 0 to 179)."""
    if theta is None:
        theta = mgrid[0:180]
    s = zeros((arr.shape[1], len(theta)), 'd')
    k = 0
    for th in theta:
        # rotate the image and project by summing down the columns
        im = imrotate(arr, -th)
        s[:,k] = sum(im, axis=0)
        k += 1
    return s

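# Usage sketch for radon with a square phantom; theta defaults to one
# projection per degree, so the sinogram has 180 columns:
#
#     >>> im = zeros((64, 64), 'd')
#     >>> im[24:40, 24:40] = 1.0
#     >>> sino = radon(im)      # shape (64, 180)
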
def binom_test(x, n=None, p=0.5):
    """An exact (two-sided) test of the null hypothesis that the
    probability of success in a Bernoulli experiment is p.

    Inputs:

       x -- Number of successes (or a vector of length 2 giving the
            number of successes and number of failures respectively)
       n -- Number of trials (ignored if x has length 2)
       p -- Hypothesized probability of success

    Returns:

       pval -- Probability that the null hypothesis is true but is
               rejected for this set of x and n.
    """
    x = atleast_1d(x)
    if len(x) == 2:
        n = x[1] + x[0]
        x = x[0]
    elif len(x) == 1:
        x = x[0]
        if n is None or n < x:
            raise ValueError, "n must be >= x"
    else:
        raise ValueError, "Incorrect length for x."

    if (p > 1.0) or (p < 0.0):
        raise ValueError, "p must be in range [0,1]"

    d = distributions.binom.pmf(x, n, p)
    rerr = 1 + 1e-7
    if (x*1.0/n < p):
        i = arange(x+1, n+1)
        y = sum(distributions.binom.pmf(i, n, p) <= d*rerr)
        pval = distributions.binom.cdf(x, n, p) + distributions.binom.sf(n-y, n, p)
    else:
        i = arange(0, x)
        y = sum(distributions.binom.pmf(i, n, p) <= d*rerr)
        pval = distributions.binom.cdf(y-1, n, p) + distributions.binom.sf(x-1, n, p)
    return min(1.0, pval)

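# Two equivalent calls to binom_test: 9 successes in 20 trials under a
# fair-coin null (a clearly non-significant outcome):
#
#     >>> pval = binom_test(9, 20, p=0.5)
#     >>> pval = binom_test([9, 11])      # successes/failures vector form
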
def bartlett(*args):
    """Perform Bartlett test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  bartlett(x,y,z,...)

    Outputs: (T, pval)

         T    -- the Test statistic
         pval -- significance level if null is rejected with this value of T
                 (prob. that null is true but rejected with this p-value.)

    Sensitive to departures from normality.  The Levene test is an
    alternative that is less sensitive to departures from normality.

    References:

      http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm

      Snedecor, George W. and Cochran, William G. (1989), Statistical
      Methods, Eighth Edition, Iowa State University Press.
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    Ni = zeros(k)
    ssq = zeros(k, 'd')
    for j in range(k):
        Ni[j] = len(args[j])
        ssq[j] = stats.var(args[j])
    Ntot = sum(Ni)
    spsq = sum((Ni-1)*ssq)/(1.0*(Ntot-k))
    numer = (Ntot*1.0 - k)*log(spsq) - sum((Ni-1.0)*log(ssq))
    denom = 1.0 + (1.0/(3*(k-1)))*((sum(1.0/(Ni-1.0))) - 1.0/(Ntot-k))
    T = numer / denom
    pval = distributions.chi2.sf(T, k-1)  # 1 - cdf
    return T, pval

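# Sketch for bartlett with three samples; the second has its scale
# inflated so the null of equal variances should tend to be rejected.
# For non-normal data prefer the levene or fligner tests below:
#
#     >>> a = distributions.norm.rvs(size=30)
#     >>> b = 1.5*distributions.norm.rvs(size=30)
#     >>> c = distributions.norm.rvs(size=30)
#     >>> T, p = bartlett(a, b, c)
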
def oneway(*args, **kwds):
    """Test for equal means in two or more samples from the normal
    distribution.

    If the keyword parameter <equal_var> is true then the variances are
    assumed to be equal, otherwise they are not assumed to be equal
    (default).

    Return test statistic and the p-value giving the probability of error
    if the null hypothesis (equal means) is rejected at this value.
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    if 'equal_var' in kwds.keys():
        if kwds['equal_var']:
            evar = 1
        else:
            evar = 0
    else:
        evar = 0

    Ni = array([len(args[i]) for i in range(k)])
    Mi = array([stats.mean(args[i]) for i in range(k)])
    Vi = array([stats.var(args[i]) for i in range(k)])
    Wi = Ni / Vi
    swi = sum(Wi)
    N = sum(Ni)
    my = sum(Mi*Ni)*1.0/N
    tmp = sum((1 - Wi/swi)**2 / (Ni-1.0))/(k*k - 1.0)
    if evar:
        F = ((sum(Ni*(Mi-my)**2) / (k-1.0)) / (sum((Ni-1.0)*Vi) / (N-k)))
        pval = distributions.f.sf(F, k-1, N-k)  # 1 - cdf
    else:
        m = sum(Wi*Mi)*1.0/swi
        F = sum(Wi*(Mi-m)**2) / ((k-1.0)*(1 + 2*(k-2)*tmp))
        pval = distributions.f.sf(F, k-1.0, 1.0/(3*tmp))
    return F, pval

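# Usage sketch for oneway; note the default does NOT assume equal
# variances (pass equal_var=1 for the classic one-way ANOVA F test):
#
#     >>> g1 = distributions.norm.rvs(size=25) + 1.0
#     >>> g2 = distributions.norm.rvs(size=25)
#     >>> F, p = oneway(g1, g2)
#     >>> F, p = oneway(g1, g2, equal_var=1)
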
def anderson(x, dist='norm'):
    """Anderson-Darling test for data coming from a normal, exponential,
    logistic, or Gumbel (Extreme Value Type I) distribution.

    Given samples x, return A2, the Anderson-Darling statistic, the
    significance levels in percentages, and the corresponding critical
    values.

    Critical values provided are for the following significance levels:

       norm/expon:  15%, 10%, 5%, 2.5%, 1%
       Gumbel:      25%, 10%, 5%, 2.5%, 1%
       logistic:    25%, 10%, 5%, 2.5%, 1%, 0.5%

    If A2 is larger than one of these critical values, then for the
    corresponding significance level the hypothesis that the data come
    from the chosen distribution can be rejected.
    """
    if not dist in ['norm', 'expon', 'gumbel', 'extreme1', 'logistic']:
        raise ValueError, "Invalid distribution."
    y = sort(x)
    xbar = stats.mean(x)
    N = len(y)
    if dist == 'norm':
        s = stats.std(x)
        w = (y-xbar)/s
        z = distributions.norm.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
    elif dist == 'expon':
        w = y / xbar
        z = distributions.expon.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
    elif dist == 'logistic':
        def rootfunc(ab, xj, N):
            a, b = ab
            tmp = (xj-a)/b
            tmp2 = exp(tmp)
            val = [sum(1.0/(1+tmp2)) - 0.5*N,
                   sum(tmp*(1.0-tmp2)/(1+tmp2)) + N]
            return array(val)
        sol0 = array([xbar, stats.std(x)])
        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
        w = (y-sol[0])/sol[1]
        z = distributions.logistic.cdf(w)
        sig = array([25, 10, 5, 2.5, 1, 0.5])
        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
    else:  # gumbel / extreme1
        def fixedsolve(th, xj, N):
            val = stats.sum(xj)*1.0/N
            tmp = exp(-xj/th)
            term = sum(xj*tmp)
            term /= sum(tmp)
            return val - term
        s = optimize.fixed_point(fixedsolve, 1.0, args=(x, N), xtol=1e-5)
        xbar = -s*log(sum(exp(-x/s))*1.0/N)
        w = (y-xbar)/s
        z = distributions.gumbel_l.cdf(w)
        sig = array([25, 10, 5, 2.5, 1])
        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
    i = arange(1, N+1)
    S = sum((2*i-1.0)/N*(log(z) + log(1-z[::-1])))
    A2 = -N - S
    return A2, critical, sig

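# Sketch for anderson: compare the returned A2 against the critical value
# at each tabulated significance level:
#
#     >>> x = distributions.norm.rvs(size=100)
#     >>> A2, crit, sig = anderson(x, dist='norm')
#     >>> # reject normality at the sig[i]% level whenever A2 > crit[i]
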
def ansari(x, y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.

    Specifically, compute the AB statistic and the probability of error that
    the null hypothesis is true but rejected with the computed statistic as
    the critical value.

    One can reject the null hypothesis that the ratio of variances is 1 if
    the returned probability of error is small (say < 0.05).
    """
    x, y = asarray(x), asarray(y)
    n = len(x)
    m = len(y)
    if m < 1:
        raise ValueError, "Not enough other observations."
    if n < 1:
        raise ValueError, "Not enough test observations."
    N = m + n
    xy = r_[x, y]  # combine
    rank = stats.rankdata(xy)
    symrank = amin(array((rank, N-rank+1)), 0)
    AB = sum(symrank[:n])
    uxy = unique(xy)
    repeats = (len(uxy) != len(xy))
    exact = ((m < 55) and (n < 55) and not repeats)
    if repeats and ((m < 55) or (n < 55)):
        print "Ties preclude use of exact statistic."
    if exact:
        astart, a1, ifault = statlib.gscale(n, m)
        ind = AB - astart
        total = sum(a1)
        if ind < len(a1)/2.0:
            cind = int(ceil(ind))
            if ind == cind:
                pval = 2.0*sum(a1[:cind+1])/total
            else:
                pval = 2.0*sum(a1[:cind])/total
        else:
            find = int(floor(ind))
            if ind == floor(ind):
                pval = 2.0*sum(a1[find:])/total
            else:
                pval = 2.0*sum(a1[find+1:])/total
        return AB, min(1.0, pval)
    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n*(N+1.0)**2 / 4.0 / N
        varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2)
    else:
        mnAB = n*(N+2.0)/4.0
        varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0)
    if repeats:  # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank**2)
        if N % 2:  # N odd
            varAB = m*n*(16*N*fac - (N+1)**4)/(16.0 * N**2 * (N-1))
        else:  # N even
            varAB = m*n*(16*fac - N*(N+2)**2)/(16.0 * N * (N-1))
    z = (AB - mnAB)/sqrt(varAB)
    pval = (1 - distributions.norm.cdf(abs(z)))*2.0
    return AB, pval

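# Sketch for ansari; with these sizes (both < 55) and continuous data the
# exact null distribution from statlib.gscale is used:
#
#     >>> x = distributions.norm.rvs(size=30)
#     >>> y = 3.0*distributions.norm.rvs(size=35)
#     >>> AB, p = ansari(x, y)
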
def fligner(*args, **kwds):
    """Perform the Fligner-Killeen test with the null hypothesis that all
    input samples have equal variances.

    Inputs are sample vectors:  fligner(x,y,z,...)

    One keyword input, center, can be used with values
       center = 'mean', center='median' (default), center='trimmed'

    Outputs: (Xsq, pval)

         Xsq  -- the Test statistic
         pval -- significance level if null is rejected with this value of X
                 (prob. that null is true but rejected with this p-value.)

    References:

       http://www.stat.psu.edu/~bgl/center/tr/TR993.ps

       Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
       tests for scale. 'Journal of the American Statistical Association.'
       71(353), 210-213.
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    if 'center' in kwds.keys():
        center = kwds['center']
    else:
        center = 'median'
    if not center in ['mean', 'median', 'trimmed']:
        raise ValueError, "Keyword argument <center> must be 'mean', 'median'"\
              + " or 'trimmed'."
    if center == 'median':
        func = stats.median
    elif center == 'mean':
        func = stats.mean
    else:
        func = stats.trim_mean

    Ni = asarray([len(args[j]) for j in range(k)])
    Yci = asarray([func(args[j]) for j in range(k)])
    Ntot = sum(Ni)
    # compute Zij's
    Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)]
    allZij = []
    g = [0]
    for i in range(k):
        allZij.extend(list(Zij[i]))
        g.append(len(allZij))
    a = distributions.norm.ppf(stats.rankdata(allZij)/(2*(Ntot+1.0)) + 0.5)
    # compute Aibar
    Aibar = _apply_func(a, g, sum) / Ni
    anbar = stats.mean(a)
    varsq = stats.var(a)
    Xsq = sum(Ni*(asarray(Aibar) - anbar)**2.0)/varsq
    pval = distributions.chi2.sf(Xsq, k-1)  # 1 - cdf
    return Xsq, pval

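# Sketch for fligner, reusing the a, b, c samples from the bartlett sketch
# above; the center keyword selects the location estimate the absolute
# deviations are taken from:
#
#     >>> Xsq, p = fligner(a, b, c)                   # median-centered
#     >>> Xsq, p = fligner(a, b, c, center='mean')
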
def levene(*args, **kwds):
    """Perform Levene test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  levene(x,y,z,...)

    One keyword input, center, can be used with values
       center = 'mean', center='median' (default), center='trimmed'

    center='median' is recommended for skewed (non-normal) distributions
    center='mean' is recommended for symmetric, moderate-tailed, dist.
    center='trimmed' is recommended for heavy-tailed distributions.

    Outputs: (W, pval)

         W    -- the Test statistic
         pval -- significance level if null is rejected with this value of W
                 (prob. that null is true but rejected with this p-value.)

    References:

       http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm

       Levene, H. (1960). In Contributions to Probability and Statistics:
       Essays in Honor of Harold Hotelling, I. Olkin et al. eds., Stanford
       University Press, pp. 278-292.

       Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
       Statistical Association, 69, 364-367
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    Ni = zeros(k)
    Yci = zeros(k, 'd')
    if 'center' in kwds.keys():
        center = kwds['center']
    else:
        center = 'median'
    if not center in ['mean', 'median', 'trimmed']:
        raise ValueError, "Keyword argument <center> must be 'mean', 'median'"\
              + " or 'trimmed'."
    if center == 'median':
        func = stats.median
    elif center == 'mean':
        func = stats.mean
    else:
        func = stats.trim_mean

    for j in range(k):
        Ni[j] = len(args[j])
        Yci[j] = func(args[j])
    Ntot = sum(Ni)

    # compute Zij's
    Zij = [None]*k
    for i in range(k):
        Zij[i] = abs(asarray(args[i]) - Yci[i])
    # compute Zbari
    Zbari = zeros(k, 'd')
    Zbar = 0.0
    for i in range(k):
        Zbari[i] = stats.mean(Zij[i])
        Zbar += Zbari[i]*Ni[i]
    Zbar /= Ntot

    numer = (Ntot-k)*sum(Ni*(Zbari-Zbar)**2)

    # compute denom_variance
    dvar = 0.0
    for i in range(k):
        dvar += sum((Zij[i]-Zbari[i])**2)
    denom = (k-1.0)*dvar

    W = numer / denom
    pval = distributions.f.sf(W, k-1, Ntot-k)  # 1 - cdf
    return W, pval

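# Sketch for levene, again reusing a, b, c from the bartlett sketch; the
# default center='median' is the Brown-Forsythe variant cited above:
#
#     >>> W, p = levene(a, b, c)
#     >>> W, p = levene(a, b, c, center='mean')
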