def _find_repeats(arr): """Find repeats in the array and return (repeats, repeat_count) """ arr = sort(arr) lastval = arr[0] howmany = 0 ind = 1 N = len(arr) repeat = 0 replist = [] repnum = [] while ind < N: if arr[ind] != lastval: if repeat: repnum.append(howmany+1) repeat = 0 howmany = 0 else: howmany += 1 repeat = 1 if (howmany == 1): replist.append(arr[ind]) lastval = arr[ind] ind += 1 if repeat: repnum.append(howmany+1) return replist, repnum
def probplot(x, sparams=(), dist='norm', fit=1, plot=None): """Return (osm, osr){,(scale,loc,r)} where (osm, osr) are order statistic medians and ordered response data respectively so that plot(osm, osr) is a probability plot. If fit==1, then do a regression fit and compute the slope (scale), intercept (loc), and correlation coefficient (r), of the best straight line through the points. If fit==0, only (osm, osr) is returned. sparams is a tuple of shape parameter arguments for the distribution. """ N = len(x) Ui = zeros(N)*1.0 Ui[-1] = 0.5**(1.0/N) Ui[0] = 1-Ui[-1] i = arange(2,N) Ui[1:-1] = (i-0.3175)/(N+0.365) try: ppf_func = eval('distributions.%s.ppf'%dist) except AttributError: raise dist, "is not a valid distribution with a ppf." if sparams is None: sparams = () if isscalar(sparams): sparams = (sparams,) if not isinstance(sparams,types.TupleType): sparams = tuple(sparams) res = inspect.getargspec(ppf_func) if not ('loc' == res[0][-2] and 'scale' == res[0][-1] and \ 0.0==res[-1][-2] and 1.0==res[-1][-1]): raise ValueError, "Function has does not have default location", \ "and scale parameters\n that are 0.0 and 1.0 respectively." if (len(sparams) < len(res[0])-len(res[-1])-1) or \ (len(sparams) > len(res[0])-3): raise ValueError, "Incorrect number of shape parameters." osm = ppf_func(Ui,*sparams) osr = sort(x) if fit or (plot is not None): # perform a linear fit. slope, intercept, r, prob, sterrest = stats.linregress(osm,osr) if plot is not None: try: import scipy.xplt as xplt xplt.limits() except: pass plot.plot(osm, osr, 'o', osm, slope*osm + intercept) plot.title('Probability Plot') plot.xlabel('Order Statistic Medians') plot.ylabel('Ordered Values') try: plot.expand_limits(5) except: pass xmin,xmax= amin(osm),amax(osm) ymin,ymax= amin(x),amax(x) pos = xmin+0.70*(xmax-xmin), ymin+0.01*(ymax-ymin) try: plot.addtext("r^2^=%1.4f" % r, xy=pos,tosys=1) except: pass if fit: return (osm, osr), (slope, intercept, r) else: return osm, osr
def shapiro(x,a=None,reta=0): """Shapiro and Wilk test for normality. Given random variates x, compute the W statistic and its p-value for a normality test. If p-value is high, one cannot reject the null hypothesis of normality with this test. P-value is probability that the W statistic is as low as it is if the samples are actually from a normal distribution. Output: W statistic and its p-value if reta is nonzero then also return the computed "a" values as the third output. If these are known for a given size they can be given as input instead of computed internally. """ N = len(x) if N < 3: raise ValueError, "Data must be at least length 3." if a is None: a = zeros(N,'f') init = 0 else: assert(len(a) == N/2), "a must be == len(x)/2" init = 1 y = sort(x) a,w,pw,ifault = statlib.swilk(y,a[:N/2],init) if not ifault in [0,2]: print ifault if N > 5000: print "p-value may not be accurate for N > 5000." if reta: return w, pw, a else: return w, pw
def ppcc_max(x, brack=(0.0,1.0), dist='tukeylambda'): """Returns the shape parameter that maximizes the probability plot correlation coefficient for the given data to a one-parameter family of distributions. See also ppcc_plot """ try: ppf_func = eval('distributions.%s.ppf'%dist) except AttributError: raise dist, "is not a valid distribution with a ppf." res = inspect.getargspec(ppf_func) if not ('loc' == res[0][-2] and 'scale' == res[0][-1] and \ 0.0==res[-1][-2] and 1.0==res[-1][-1]): raise ValueError, "Function has does not have default location", \ "and scale parameters\n that are 0.0 and 1.0 respectively." if (1 < len(res[0])-len(res[-1])-1) or \ (1 > len(res[0])-3): raise ValueError, "Must be a one-parameter family." N = len(x) # compute uniform median statistics Ui = zeros(N)*1.0 Ui[-1] = 0.5**(1.0/N) Ui[0] = 1-Ui[-1] i = arange(2,N) Ui[1:-1] = (i-0.3175)/(N+0.365) osr = sort(x) # this function computes the x-axis values of the probability plot # and computes a linear regression (including the correlation) # and returns 1-r so that a minimization function maximizes the # correlation def tempfunc(shape, mi, yvals, func): xvals = func(mi, shape) r, prob = stats.pearsonr(xvals, yvals) return 1-r return optimize.brent(tempfunc, brack=brack, args=(Ui, osr, ppf_func))
def anderson(x,dist='norm'): """Anderson and Darling test for normal, exponential, or Gumbel (Extreme Value Type I) distribution. Given samples x, return A2, the Anderson-Darling statistic, the significance levels in percentages, and the corresponding critical values. Critical values provided are for the following significance levels norm/expon: 15%, 10%, 5%, 2.5%, 1% Gumbel: 25%, 10%, 5%, 2.5%, 1% logistic: 25%, 10%, 5%, 2.5%, 1%, 0.5% If A2 is larger than these critical values then for that significance level, the hypothesis that the data come from a normal (exponential) can be rejected. """ if not dist in ['norm','expon','gumbel','extreme1','logistic']: raise ValueError, "Invalid distribution." y = sort(x) xbar = stats.mean(x) N = len(y) if dist == 'norm': s = stats.std(x) w = (y-xbar)/s z = distributions.norm.cdf(w) sig = array([15,10,5,2.5,1]) critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N),3) elif dist == 'expon': w = y / xbar z = distributions.expon.cdf(w) sig = array([15,10,5,2.5,1]) critical = around(_Avals_expon / (1.0 + 0.6/N),3) elif dist == 'logistic': def rootfunc(ab,xj,N): a,b = ab tmp = (xj-a)/b tmp2 = exp(tmp) val = [sum(1.0/(1+tmp2))-0.5*N, sum(tmp*(1.0-tmp2)/(1+tmp2))+N] return array(val) sol0=array([xbar,stats.std(x)]) sol = optimize.fsolve(rootfunc,sol0,args=(x,N),xtol=1e-5) w = (y-sol[0])/sol[1] z = distributions.logistic.cdf(w) sig = array([25,10,5,2.5,1,0.5]) critical = around(_Avals_logistic / (1.0+0.25/N),3) else: def fixedsolve(th,xj,N): val = stats.sum(xj)*1.0/N tmp = exp(-xj/th) term = sum(xj*tmp) term /= sum(tmp) return val - term s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5) xbar = -s*log(sum(exp(-x/s))*1.0/N) w = (y-xbar)/s z = distributions.gumbel_l.cdf(w) sig = array([25,10,5,2.5,1]) critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)),3) i = arange(1,N+1) S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1]))) A2 = -N-S return A2, critical, sig
def tempfunc(lmbda, xvals, samps): y = boxcox(samps,lmbda) yvals = sort(y) r, prob = stats.pearsonr(xvals, yvals) return 1-r