def bayes_mvs(data, alpha=0.90):
    """Return Bayesian confidence intervals for the mean, var, and std.

    Assumes 1-d data, all with the same mean and variance, and uses a
    Jeffreys prior for the variance and std.

    alpha gives the probability that the returned interval contains
    the true parameter.

    Uses the peak of the conditional pdf as the starting center.

    Returns (peak, (a, b)) for each of mean, variance and standard deviation.
    """
    x = ravel(data)
    n = len(x)
    assert(n > 1)
    n = float(n)
    xbar = sb.add.reduce(x)/n
    C = sb.add.reduce(x*x)/n - xbar*xbar
    # mean
    fac = sqrt(C/(n-1))
    tval = distributions.t.ppf((1+alpha)/2.0, n-1)
    delta = fac*tval
    ma = xbar - delta
    mb = xbar + delta
    mp = xbar
    # variance
    fac = n*C/2.0
    peak = 2/(n+1.)
    a = (n-1)/2.0
    F_peak = distributions.invgamma.cdf(peak, a)
    q1 = F_peak - alpha/2.0
    q2 = F_peak + alpha/2.0
    if (q1 < 0):  # non-symmetric area
        q2 = alpha
        va = 0.0
    else:
        va = fac*distributions.invgamma.ppf(q1, a)
    vb = fac*distributions.invgamma.ppf(q2, a)
    vp = peak*fac
    # standard deviation
    fac = sqrt(fac)
    peak = sqrt(2./n)
    F_peak = distributions.gengamma.cdf(peak, a, -2)
    q1 = F_peak - alpha/2.0
    q2 = F_peak + alpha/2.0
    if (q1 < 0):
        q2 = alpha
        sta = 0.0
    else:
        sta = fac*distributions.gengamma.ppf(q1, a, -2)
    stb = fac*distributions.gengamma.ppf(q2, a, -2)
    stp = peak*fac
    return (mp, (ma, mb)), (vp, (va, vb)), (stp, (sta, stb))
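
# Illustrative usage sketch (added for exposition; not part of the original
# module, data made up).  Each element of the returned triple is
# (center, (lower, upper)):
#
#     >>> data = [2.1, 1.9, 2.4, 2.2, 2.0, 2.3]
#     >>> (mp, (ma, mb)), (vp, (va, vb)), (sp, (sa, sb_)) = bayes_mvs(data, alpha=0.95)
#     >>> ma < mp < mb          # the mean interval brackets its center
#     True
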
def pdf_moments(cnt):
    """Return the Gaussian expanded pdf function given the list of
    central moments (first one is mean).
    """
    N = len(cnt)
    if N < 2:
        raise ValueError, "At least two moments must be given to " + \
              "approximate the pdf."
    totp = poly1d(1)
    sig = sqrt(cnt[1])
    mu = cnt[0]
    if N > 2:
        Dvals = _hermnorm(N+1)
        for k in range(3, N+1):
            # Find Ck
            Ck = 0.0
            for n in range((k-3)/2):
                m = k - 2*n
                if m % 2:  # m is odd
                    momdiff = cnt[m-1]
                else:
                    momdiff = cnt[m-1] - sig*sig*scipy.factorial2(m-1)
                Ck += Dvals[k][m] / sig**m * momdiff
            # Add to totp
            totp = totp + Ck*Dvals[k]

    def thisfunc(x):
        xn = (x-mu)/sig
        return totp(xn)*exp(-xn*xn/2.0)/sqrt(2*pi)/sig
    return thisfunc
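
# Illustrative usage sketch (made-up moments).  With only mean and variance
# given, no correction terms are added and the result is the plain Gaussian
# density:
#
#     >>> f = pdf_moments([0.0, 1.0])   # mean 0, variance 1
#     >>> f(0.0)                        # ~ 1/sqrt(2*pi) ~ 0.3989
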
def wilcoxon(x, y=None):
    """Calculates the Wilcoxon signed-rank test for the null hypothesis
    that two paired samples come from the same distribution: a
    non-parametric analogue of the paired T-test.  (The normal
    approximation used here needs N > 20.)

    Returns: t-statistic, two-tailed p-value
    """
    if y is None:
        d = x
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError, 'Unequal N in wilcoxon.  Aborting.'
        d = x-y
    d = compress(not_equal(d, 0), d)  # Keep all non-zero differences
    count = len(d)
    if (count < 10):
        print "Warning: sample size too small for normal approximation."
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0)*r)
    r_minus = sum((d < 0)*r)
    T = min(r_plus, r_minus)
    mn = count*(count+1.0)*0.25
    se = math.sqrt(count*(count+1)*(2*count+1.0)/24)
    if (len(r) != len(unique(r))):  # handle ties in data
        replist, repnum = find_repeats(r)
        corr = 0.0
        for i in range(len(replist)):
            si = repnum[i]
            corr += 0.5*si*(si*si-1.0)
        V = se*se - corr
        se = sqrt((count*V - T*T)/(count-1.0))
    z = (T - mn)/se
    prob = 2*(1.0 - stats.zprob(abs(z)))
    return T, prob
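
# Illustrative usage sketch (made-up paired measurements, chosen so no
# difference is zero and the small-sample warning is not triggered):
#
#     >>> before = [125, 115, 130, 140, 140, 115, 140, 125, 140, 135]
#     >>> after  = [110, 122, 125, 120, 139, 124, 123, 137, 135, 145]
#     >>> T, p = wilcoxon(before, after)   # small p suggests a systematic shift
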
def mood(x, y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using a Mood test.

    Specifically, compute the z statistic and the probability of error
    that the null hypothesis is true but rejected with the computed
    statistic as the critical value.

    One can reject the null hypothesis that the ratio of scale parameters
    is 1 if the returned probability of error is small (say < 0.05).
    """
    n = len(x)
    m = len(y)
    xy = r_[x, y]
    N = m+n
    if (N < 3):
        raise ValueError, "Not enough observations."
    ranks = stats.rankdata(xy)
    Ri = ranks[:n]
    M = sum((Ri - (N+1.0)/2)**2)
    # Mean and variance of M under the null (normal approximation)
    mnM = n*(N*N-1.0)/12
    varM = m*n*(N+1.0)*(N+2)*(N-2)/180
    z = (M-mnM)/sqrt(varM)
    p = distributions.norm.cdf(z)
    pval = 2*min(p, 1-p)
    return z, pval
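
# Illustrative usage sketch (made-up samples with roughly equal medians
# but different spread):
#
#     >>> x1 = [9.9, 10.0, 10.1, 9.8, 10.2]
#     >>> x2 = [9.0, 10.0, 11.0, 8.5, 11.5]
#     >>> z, p = mood(x1, x2)   # small p suggests unequal scale parameters
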
def circstd(samples, high=2*pi, low=0):
    """Compute the circular standard deviation for samples assumed
    to be in the range [low, high].
    """
    ang = (samples - low)*2*pi / (high-low)  # map onto [0, 2*pi)
    res = stats.mean(exp(1j*ang))            # mean resultant vector
    V = 1 - abs(res)                         # circular variance
    return ((high-low)/2.0/pi) * sqrt(V)
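
# Illustrative usage sketch: angles clustered around 0 (mod 2*pi) give a
# small circular spread even though their linear std would be large:
#
#     >>> circstd(array([0.1, 6.2, 0.05, 6.25, 0.0]))   # radians, wrap at 2*pi
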
def factorial2(n, exact=0):
    """Double factorial.

    n!! = special.gamma(n/2+1)*2**((n+1)/2)/sqrt(pi)   n odd
        = 2**(n/2) * (n/2)!                            n even

    If exact==0, then floating point precision is used, otherwise
    an exact long integer is computed.

    Notes:
      - Array argument accepted only for exact=0 case.
      - If n < -1, the return value is 0 (and (-1)!! == 0!! == 1).
    """
    if exact:
        if n < -1:
            return 0L
        if n <= 0:
            return 1L
        n = long(n)
        val = 1L
        k = n
        while (k > 0):
            val = val*k
            k -= 2
        return val
    else:
        n = asarray(n)
        vals = zeros(n.shape, 'd')
        cond1 = (n % 2) & (n >= -1)       # odd n
        cond2 = (1-(n % 2)) & (n >= -1)   # even n
        oddn = extract(cond1, n)
        evenn = extract(cond2, n)
        nd2o = oddn / 2.0
        nd2e = evenn / 2.0
        insert(vals, cond1, special.gamma(nd2o+1)/sqrt(pi)*pow(2.0, nd2o+0.5))
        insert(vals, cond2, special.gamma(nd2e+1) * pow(2.0, nd2e))
        return vals
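
# Illustrative usage sketch:
#
#     >>> factorial2(7, exact=1)     # 7*5*3*1
#     105L
#     >>> factorial2(8, exact=1)     # 8*6*4*2
#     384L
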
def norm(x, ord=2):
    """norm(x, ord=2) -> n

    Matrix and vector norm.

    Inputs:
      x -- a rank-1 (vector) or rank-2 (matrix) array
      ord -- the order of the norm.

    Comments:
      For vectors ord can be any real number including Inf or -Inf.
        ord = Inf, computes the maximum of the magnitudes
        ord = -Inf, computes the minimum of the magnitudes
        ord is finite, computes sum(abs(x)**ord)**(1.0/ord)

      For matrices ord can only be +/- 1, +/- 2, +/- Inf, or 'fro'.
        ord = 2 computes the largest singular value
        ord = -2 computes the smallest singular value
        ord = 1 computes the largest column sum of absolute values
        ord = -1 computes the smallest column sum of absolute values
        ord = Inf computes the largest row sum of absolute values
        ord = -Inf computes the smallest row sum of absolute values
        ord = 'fro' computes the Frobenius norm sqrt(sum(diag(X.H * X)))
    """
    x = asarray_chkfinite(x)
    nd = len(x.shape)
    Inf = scipy_base.Inf
    if nd == 1:
        if ord == Inf:
            return scipy_base.amax(abs(x))
        elif ord == -Inf:
            return scipy_base.amin(abs(x))
        else:
            return scipy_base.sum(abs(x)**ord)**(1.0/ord)
    elif nd == 2:
        if ord == 2:
            return scipy_base.amax(decomp.svd(x, compute_uv=0))
        elif ord == -2:
            return scipy_base.amin(decomp.svd(x, compute_uv=0))
        elif ord == 1:
            return scipy_base.amax(scipy_base.sum(abs(x)))
        elif ord == Inf:
            return scipy_base.amax(scipy_base.sum(abs(x), axis=1))
        elif ord == -1:
            return scipy_base.amin(scipy_base.sum(abs(x)))
        elif ord == -Inf:
            return scipy_base.amin(scipy_base.sum(abs(x), axis=1))
        elif ord in ['fro', 'f']:
            val = real((conjugate(x)*x).flat)
            return sqrt(add.reduce(val))
        else:
            raise ValueError, "Invalid norm order for matrices."
    else:
        raise ValueError, "Improper number of dimensions to norm."
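
# Illustrative usage sketch:
#
#     >>> a = array([[1., 2.], [3., 4.]])
#     >>> norm(a)              # largest singular value (spectral norm)
#     >>> norm(a, 'fro')       # Frobenius norm, sqrt(1+4+9+16)
#     >>> norm([3., 4.], 2)    # vector 2-norm
#     5.0
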
def pdf_fromgamma(g1, g2, g3=0.0, g4=None):
    """Return an approximate pdf built from an expansion in normalized
    Hermite polynomials, with coefficients determined by the parameters
    g1..g4 (g4 defaults to 3*g2*g2).
    """
    if g4 is None:
        g4 = 3*g2*g2
    sigsq = 1.0/g2
    sig = sqrt(sigsq)
    mu = g1*sig**3.0
    p12 = _hermnorm(13)
    for k in range(13):
        p12[k] = p12[k]/sig**k

    # Add all of the terms to the polynomial
    totp = p12[0] - (g1/6.0*p12[3]) + \
           (g2/24.0*p12[4] + g1*g1/72.0*p12[6]) - \
           (g3/120.0*p12[5] + g1*g2/144.0*p12[7] + g1**3.0/1296.0*p12[9]) + \
           (g4/720*p12[6] + (g2*g2/1152.0 + g1*g3/720)*p12[8] +
            g1*g1*g2/1728.0*p12[10] + g1**4.0/31104.0*p12[12])
    # Final normalization
    totp = totp / sqrt(2*pi)/sig
    def thefunc(x):
        xn = (x-mu)/sig
        return totp(xn)*exp(-xn*xn/2.0)
    return thefunc
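
# Illustrative usage sketch (parameter values made up; g2 must be positive
# since the internal scale is sig = sqrt(1/g2)):
#
#     >>> f = pdf_fromgamma(0.0, 1.0)   # symmetric case: g1 = g3 = 0
#     >>> y = f(0.0)                    # evaluate the expansion at a point
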
def ansari(x, y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.

    Specifically, compute the AB statistic and the probability of error
    that the null hypothesis is true but rejected with the computed
    statistic as the critical value.

    One can reject the null hypothesis that the ratio of variances is 1
    if the returned probability of error is small (say < 0.05).
    """
    x, y = asarray(x), asarray(y)
    n = len(x)
    m = len(y)
    if (m < 1):
        raise ValueError, "Not enough other observations."
    if (n < 1):
        raise ValueError, "Not enough test observations."
    N = m+n
    xy = r_[x, y]  # combine
    rank = stats.rankdata(xy)
    symrank = amin(array((rank, N-rank+1)), 0)
    AB = sum(symrank[:n])
    uxy = unique(xy)
    repeats = (len(uxy) != len(xy))
    exact = ((m < 55) and (n < 55) and not repeats)
    if repeats and ((m < 55) or (n < 55)):
        print "Ties preclude use of exact statistic."
    if exact:
        astart, a1, ifault = statlib.gscale(n, m)
        ind = AB - astart
        total = sum(a1)
        if ind < len(a1)/2.0:
            cind = int(ceil(ind))
            if (ind == cind):
                pval = 2.0*sum(a1[:cind+1])/total
            else:
                pval = 2.0*sum(a1[:cind])/total
        else:
            find = int(floor(ind))
            if (ind == floor(ind)):
                pval = 2.0*sum(a1[find:])/total
            else:
                pval = 2.0*sum(a1[find+1:])/total
        return AB, min(1.0, pval)
    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n*(N+1.0)**2 / 4.0 / N
        varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2)
    else:
        mnAB = n*(N+2.0)/4.0
        varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0)
    if repeats:  # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank**2)
        if N % 2:  # N odd
            varAB = m*n*(16*N*fac - (N+1)**4)/(16.0 * N**2 * (N-1))
        else:  # N even
            varAB = m*n*(16*fac - N*(N+2)**2)/(16.0 * N * (N-1))
    z = (AB - mnAB)/sqrt(varAB)
    pval = (1-distributions.norm.cdf(abs(z)))*2.0
    return AB, pval
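
# Illustrative usage sketch (small tie-free samples, so the exact
# statlib.gscale tables are used rather than the normal approximation):
#
#     >>> x1 = [1.3, 2.7, 0.2, 3.1, 2.0]
#     >>> x2 = [0.9, 1.1, 1.8, 1.2, 1.5]
#     >>> AB, p = ansari(x1, x2)   # small p suggests unequal scales
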
def anderson(x, dist='norm'):
    """Anderson-Darling test for data coming from a normal, exponential,
    logistic, or Gumbel (Extreme Value Type I) distribution.

    Given samples x, return A2, the Anderson-Darling statistic, the
    corresponding critical values, and the significance levels (in
    percent) at which those critical values apply:

        norm/expon:  15%, 10%, 5%, 2.5%, 1%
        Gumbel:      25%, 10%, 5%, 2.5%, 1%
        logistic:    25%, 10%, 5%, 2.5%, 1%, 0.5%

    If A2 is larger than the critical value for a given significance
    level, then the hypothesis that the data come from the chosen
    distribution can be rejected at that level.
    """
    if not dist in ['norm', 'expon', 'gumbel', 'extreme1', 'logistic']:
        raise ValueError, "Invalid distribution."
    y = sort(x)
    xbar = stats.mean(x)
    N = len(y)
    if dist == 'norm':
        s = stats.std(x)
        w = (y-xbar)/s
        z = distributions.norm.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
    elif dist == 'expon':
        w = y / xbar
        z = distributions.expon.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
    elif dist == 'logistic':
        def rootfunc(ab, xj, N):
            a, b = ab
            tmp = (xj-a)/b
            tmp2 = exp(tmp)
            val = [sum(1.0/(1+tmp2)) - 0.5*N,
                   sum(tmp*(1.0-tmp2)/(1+tmp2)) + N]
            return array(val)
        sol0 = array([xbar, stats.std(x)])
        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
        w = (y-sol[0])/sol[1]
        z = distributions.logistic.cdf(w)
        sig = array([25, 10, 5, 2.5, 1, 0.5])
        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
    else:  # Gumbel / extreme1
        def fixedsolve(th, xj, N):
            val = stats.sum(xj)*1.0/N
            tmp = exp(-xj/th)
            term = sum(xj*tmp)
            term /= sum(tmp)
            return val - term
        s = optimize.fixed_point(fixedsolve, 1.0, args=(x, N), xtol=1e-5)
        xbar = -s*log(sum(exp(-x/s))*1.0/N)
        w = (y-xbar)/s
        z = distributions.gumbel_l.cdf(w)
        sig = array([25, 10, 5, 2.5, 1])
        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
    i = arange(1, N+1)
    S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1])))
    A2 = -N - S
    return A2, critical, sig
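
# Illustrative usage sketch (made-up data):
#
#     >>> data = [0.2, -1.1, 0.7, 1.5, -0.3, 0.9, -0.6, 0.1, 1.2, -0.8]
#     >>> A2, crit, sig = anderson(data, dist='norm')
#     >>> # reject normality at the 5% level if A2 exceeds the critical
#     >>> # value paired with 5 in sig (here crit[2], since sig[2] == 5)
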