def wilcoxon(x,y=None):
    """
Calculates the Wilcoxon signed-rank test for the null hypothesis that two samples come from the same distribution. A non-parametric T-test. (need N > 20)

Returns: t-statistic, two-tailed p-value
"""
    if y is None:
        d = x
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError, 'Unequal N in wilcoxon.  Aborting.'
        d = x-y
    d = compress(not_equal(d,0),d) # Keep all non-zero differences
    count = len(d)
    if (count < 10):
        print "Warning: sample size too small for normal approximation."
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0)*r)
    r_minus = sum((d < 0)*r)
    T = min(r_plus, r_minus)
    mn = count*(count+1.0)*0.25
    se = math.sqrt(count*(count+1)*(2*count+1.0)/24)
    if (len(r) != len(unique(r))):  # handle ties in data
        replist, repnum = find_repeats(r)
        corr = 0.0
        for i in range(len(replist)):
            si = repnum[i]
            corr += 0.5*si*(si*si-1.0)
        V = se*se - corr
        se = sqrt((count*V - T*T)/(count-1.0))
    z = (T - mn)/se
    prob = 2*(1.0 - stats.zprob(abs(z)))
    return T, prob
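# Usage sketch (not from the original source; the paired samples below
# are made-up illustration data):
#
#   >>> before = [125, 115, 130, 140, 140, 115, 140, 125, 140, 135]
#   >>> after  = [110, 122, 125, 120, 140, 124, 123, 137, 135, 145]
#   >>> T, p = wilcoxon(before, after)
#
# A small p suggests the paired differences are not symmetric about zero.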
def boxcox_llf(lmb, data):
    """The boxcox log-likelihood function.
    """
    N = len(data)
    y = boxcox(data,lmb)
    my = stats.mean(y)
    f = (lmb-1)*sum(log(data))
    f -= N/2.0*log(sum((y-my)**2.0/N))
    return f
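# Usage sketch (illustrative, not from the original source; assumes a
# positive 1-d data vector and the array helpers arange/argmax in scope):
#
#   >>> lmbdas = arange(-2.0, 2.0, 0.1)
#   >>> llf = [boxcox_llf(lmb, data) for lmb in lmbdas]
#   >>> lmax = lmbdas[argmax(llf)]    # lambda maximizing the likelihood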
def norm(x, ord=2):
    """ norm(x, ord=2) -> n

    Matrix and vector norm.

    Inputs:

      x -- a rank-1 (vector) or rank-2 (matrix) array
      ord -- the order of norm.

     Comments:

       For vectors ord can be any real number including Inf or -Inf.
         ord = Inf, computes the maximum of the magnitudes
         ord = -Inf, computes minimum of the magnitudes
         ord is finite, computes sum(abs(x)**ord)**(1.0/ord)

       For matrices ord can be +/- 1, +/- 2, +/- Inf, or 'fro'.
         ord = 2 computes the largest singular value
         ord = -2 computes the smallest singular value
         ord = 1 computes the largest column sum of absolute values
         ord = -1 computes the smallest column sum of absolute values
         ord = Inf computes the largest row sum of absolute values
         ord = -Inf computes the smallest row sum of absolute values
         ord = 'fro' computes the Frobenius norm sqrt(sum(diag(X.H * X)))
    """
    x = asarray_chkfinite(x)
    nd = len(x.shape)
    Inf = scipy_base.Inf
    if nd == 1:
        if ord == Inf:
            return scipy_base.amax(abs(x))
        elif ord == -Inf:
            return scipy_base.amin(abs(x))
        else:
            return scipy_base.sum(abs(x)**ord)**(1.0/ord)
    elif nd == 2:
        if ord == 2:
            return scipy_base.amax(decomp.svd(x,compute_uv=0))
        elif ord == -2:
            return scipy_base.amin(decomp.svd(x,compute_uv=0))
        elif ord == 1:
            return scipy_base.amax(scipy_base.sum(abs(x),axis=0))
        elif ord == Inf:
            return scipy_base.amax(scipy_base.sum(abs(x),axis=1))
        elif ord == -1:
            return scipy_base.amin(scipy_base.sum(abs(x),axis=0))
        elif ord == -Inf:
            return scipy_base.amin(scipy_base.sum(abs(x),axis=1))
        elif ord in ['fro','f']:
            val = real((conjugate(x)*x).flat)
            return sqrt(add.reduce(val))
        else:
            raise ValueError, "Invalid norm order for matrices."
    else:
        raise ValueError, "Improper number of dimensions to norm."
def kstat(data,n=2):
    """Return the nth k-statistic (1<=n<=4 so far).

    The nth k-statistic is the unique symmetric unbiased estimator of the nth
    cumulant kappa_n
    """
    if n>4 or n<1:
        raise ValueError, "k-statistics only supported for 1<=n<=4"
    n = int(n)
    S = zeros(n+1,'d')
    data = ravel(data)
    N = len(data)
    for k in range(1,n+1):
        S[k] = sum(data**k)
    if n==1:
        return S[1]*1.0/N
    elif n==2:
        return (N*S[2]-S[1]**2.0)/(N*(N-1.0))
    elif n==3:
        return (2*S[1]**3 - 3*N*S[1]*S[2]+N*N*S[3]) / (N*(N-1.0)*(N-2.0))
    elif n==4:
        return (-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 - \
                4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) / \
                (N*(N-1.0)*(N-2.0)*(N-3.0))
    else:
        raise ValueError, "Should not be here."
def mood(x,y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using a Mood test.

    Specifically, compute the z statistic and the two-sided p-value: the
    probability of wrongly rejecting the null hypothesis when it is true,
    using the computed statistic as the critical value.

    One can reject the null hypothesis that the ratio of scale parameters
    is 1 if the returned p-value is small (say < 0.05).
    """
    n = len(x)
    m = len(y)
    xy = r_[x,y]
    N = m+n
    if (N < 3):
        raise ValueError, "Not enough observations."
    ranks = stats.rankdata(xy)
    Ri = ranks[:n]
    M = sum((Ri - (N+1.0)/2)**2)
    # Approx stat.
    mnM = n*(N*N-1.0)/12
    varM = m*n*(N+1.0)*(N+2)*(N-2)/180
    z = (M-mnM)/sqrt(varM)
    p = distributions.norm.cdf(z)
    pval = 2*min(p,1-p)
    return z, pval
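# Usage sketch (assumes x1 and x2 are 1-d samples with similar medians):
#
#   >>> z, p = mood(x1, x2)
#
# A small p suggests the two samples differ in scale.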
def orth(A):
    """Return an orthonormal basis for the range of A using svd"""
    u,s,vh = svd(A)
    M,N = A.shape
    tol = max(M,N)*scipy_base.amax(s)*eps
    num = scipy_base.sum(s > tol)
    Q = u[:,:num]
    return Q
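# Usage sketch (illustrative): the columns of Q are orthonormal and span
# the column space of A, so Q.shape[1] equals the numerical rank of A.
#
#   >>> A = array([[1.0, 2.0], [2.0, 4.0], [0.0, 1.0]])   # rank 2
#   >>> Q = orth(A)    # Q has shape (3, 2)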
def lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0):
    """ lstsq(a, b, cond=None, overwrite_a=0, overwrite_b=0) -> x,resids,rank,s

    Return least-squares solution of a * x = b.

    Inputs:

      a -- An M x N matrix.
      b -- An M x nrhs matrix or M vector.
      cond -- Used to determine effective rank of a.

    Outputs:

      x -- The solution (N x nrhs matrix) to the minimization problem:
                  2-norm(| b - a * x |) -> min
      resids -- The residual sum-of-squares for the solution matrix x
                (only if M>N and rank==N).
      rank -- The effective rank of a.
      s -- Singular values of a in decreasing order. The condition number
           of a is abs(s[0]/s[-1]).
    """
    a1, b1 = map(asarray_chkfinite,(a,b))
    if len(a1.shape) != 2:
        raise ValueError, 'expected matrix'
    m,n = a1.shape
    if len(b1.shape)==2: nrhs = b1.shape[1]
    else: nrhs = 1
    if m != b1.shape[0]:
        raise ValueError, 'incompatible dimensions'
    gelss, = get_lapack_funcs(('gelss',),(a1,b1))
    if n>m:
        # need to extend b matrix as it will be filled with
        # a larger solution matrix
        b2 = zeros((n,nrhs),gelss.typecode)
        if len(b1.shape)==2: b2[:m,:] = b1
        else: b2[:m,0] = b1
        b1 = b2
    overwrite_a = overwrite_a or (a1 is not a and not hasattr(a,'__array__'))
    overwrite_b = overwrite_b or (b1 is not b and not hasattr(b,'__array__'))
    if gelss.module_name[:7] == 'flapack':
        lwork = calc_lwork.gelss(gelss.prefix,m,n,nrhs)[1]
        v,x,s,rank,info = gelss(a1,b1,cond = cond,
                                lwork = lwork,
                                overwrite_a = overwrite_a,
                                overwrite_b = overwrite_b)
    else:
        raise NotImplementedError,'calling gelss from %s' % (gelss.module_name)
    if info>0: raise LinAlgError, "SVD did not converge in Linear Least Squares"
    if info<0: raise ValueError,\
       'illegal value in %d-th argument of internal gelss' % (-info)
    resids = asarray([],x.typecode())
    if n<m:
        x1 = x[:n]
        if rank==n: resids = sum(x[n:]**2)
        x = x1
    return x,resids,rank,s
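# Usage sketch (made-up data): fit a line y = c0 + c1*t in the
# least-squares sense, with a design matrix of ones and t values.
#
#   >>> t = array([0.0, 1.0, 2.0, 3.0])
#   >>> y = array([1.1, 2.9, 5.2, 7.1])
#   >>> A = transpose(array([ones(4, 'd'), t]))
#   >>> c, resids, rank, s = lstsq(A, y)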
def radon(arr,theta=None):
    """Compute the Radon transform of arr: for each angle in theta
    (degrees), rotate the image and sum along columns to form one
    projection per angle.
    """
    if theta is None:
        theta = mgrid[0:180]
    s = zeros((arr.shape[1],len(theta)),'d')
    k = 0
    for th in theta:
        im = imrotate(arr,-th)
        s[:,k] = sum(im,axis=0)
        k += 1
    return s
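# Usage sketch (illustrative; relies on imrotate being available, as the
# implementation above does):
#
#   >>> img = zeros((64, 64), 'd')
#   >>> img[24:40, 24:40] = 1.0              # a bright square
#   >>> sino = radon(img, theta=mgrid[0:180:10])   # 18 projections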
def binom_test(x,n=None,p=0.5):
    """An exact (two-sided) test of the null hypothesis that the
    probability of success in a Bernoulli experiment is p.

    Inputs:

       x -- Number of successes (or a vector of length 2 giving the
              number of successes and number of failures respectively)
       n -- Number of trials (ignored if x has length 2)
       p -- Hypothesized probability of success

    Returns pval -- The probability of obtaining a result at least as
                    extreme as x under the null hypothesis (the two-sided
                    p-value).
    """
    x = atleast_1d(x)
    if len(x) == 2:
        n = x[1]+x[0]
        x = x[0]
    elif len(x) == 1:
        x = x[0]
        if n is None or n < x:
            raise ValueError, "n must be >= x"
    else:
        raise ValueError, "Incorrect length for x."

    if (p > 1.0) or (p < 0.0):
        raise ValueError, "p must be in range [0,1]"

    d = distributions.binom.pmf(x,n,p)
    rerr = 1+1e-7
    if (x*1.0/n < p):
        i = arange(x+1,n+1)
        y = sum(distributions.binom.pmf(i,n,p) <= d*rerr)
        pval = distributions.binom.cdf(x,n,p) + distributions.binom.sf(n-y,n,p)
    else:
        i = arange(0,x)
        y = sum(distributions.binom.pmf(i,n,p) <= d*rerr)
        pval = distributions.binom.cdf(y-1,n,p) + distributions.binom.sf(x-1,n,p)

    return min(1.0,pval)
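# Usage sketch: exact two-sided test of whether a coin that landed heads
# 9 times out of 10 flips is fair (illustrative numbers):
#
#   >>> pval = binom_test(9, 10, p=0.5)
#   >>> pval = binom_test([9, 1])    # same thing as (successes, failures)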
def bartlett(*args):
    """Perform Bartlett test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  bartlett(x,y,z,...) 

    Outputs: (T, pval)

         T    -- the test statistic
         pval -- the p-value of the test: the probability of observing a
                 statistic at least as extreme as T if the null is true.

    Sensitive to departures from normality.  The Levene test is
    an alternative that is less sensitive to departures from
    normality.

    References:

      http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm

      Snedecor, George W. and Cochran, William G. (1989), Statistical
        Methods, Eighth Edition, Iowa State University Press.     
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    Ni = zeros(k)
    ssq = zeros(k,'d')
    for j in range(k):
        Ni[j] = len(args[j])
        ssq[j] = stats.var(args[j])
    Ntot = sum(Ni)
    spsq = sum((Ni-1)*ssq)/(1.0*(Ntot-k))
    numer = (Ntot*1.0-k)*log(spsq) - sum((Ni-1.0)*log(ssq))
    denom = 1.0 + (1.0/(3*(k-1)))*((sum(1.0/(Ni-1.0)))-1.0/(Ntot-k))
    T = numer / denom
    pval = distributions.chi2.sf(T,k-1) # 1 - cdf
    return T, pval
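# Usage sketch (assumes a, b, c are 1-d sample vectors):
#
#   >>> T, p = bartlett(a, b, c)
#
# A small p is evidence against equal variances; for non-normal data the
# less sensitive levene() below may be preferable, as the docstring notes.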
def oneway(*args,**kwds):
    """Test for equal means in two or more samples from the
    normal distribution.

    If the keyword parameter <equal_var> is true then the variances
    are assumed to be equal, otherwise they are not assumed to
    be equal (default).

    Return the test statistic and the p-value: the probability of
    wrongly rejecting the null hypothesis (equal means) at this value.
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    if 'equal_var' in kwds:
        if kwds['equal_var']: evar = 1
        else: evar = 0
    else:
        evar = 0

    Ni = array([len(args[i]) for i in range(k)])
    Mi = array([stats.mean(args[i]) for i in range(k)])
    Vi = array([stats.var(args[i]) for i in range(k)])
    Wi = Ni / Vi
    swi = sum(Wi)
    N = sum(Ni)
    my = sum(Mi*Ni)*1.0/N
    tmp = sum((1-Wi/swi)**2 / (Ni-1.0))/(k*k-1.0)
    if evar:
        F = ((sum(Ni*(Mi-my)**2) / (k-1.0)) / (sum((Ni-1.0)*Vi) / (N-k)))
        pval = distributions.f.sf(F,k-1,N-k)  # 1-cdf
    else:
        m = sum(Wi*Mi)*1.0/swi
        F = sum(Wi*(Mi-m)**2) / ((k-1.0)*(1+2*(k-2)*tmp))
        pval = distributions.f.sf(F,k-1.0,1.0/(3*tmp))

    return F, pval
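# Usage sketch (assumes g1, g2, g3 are 1-d samples; equal variances are
# not assumed by default):
#
#   >>> F, p = oneway(g1, g2, g3)
#   >>> F, p = oneway(g1, g2, g3, equal_var=1)   # classic one-way ANOVA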
def anderson(x,dist='norm'):
    """Anderson and Darling test for normal, exponential, or Gumbel
    (Extreme Value Type I) distribution.

    Given samples x, return A2, the Anderson-Darling statistic,
    the significance levels in percentages, and the corresponding
    critical values.

    Critical values provided are for the following significance levels
    norm/expon:   15%, 10%, 5%, 2.5%, 1%
    Gumbel:       25%, 10%, 5%, 2.5%, 1%
    logistic:     25%, 10%, 5%, 2.5%, 1%, 0.5%

    If A2 is larger than these critical values, then for that significance
    level the hypothesis that the data come from the chosen distribution
    can be rejected.
    """
    if dist not in ['norm','expon','gumbel','extreme1','logistic']:
        raise ValueError, "Invalid distribution."
    y = sort(x)
    xbar = stats.mean(x)
    N = len(y)
    if dist == 'norm':
        s = stats.std(x)
        w = (y-xbar)/s
        z = distributions.norm.cdf(w)
        sig = array([15,10,5,2.5,1])
        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N),3)
    elif dist == 'expon':
        w = y / xbar
        z = distributions.expon.cdf(w)
        sig = array([15,10,5,2.5,1])
        critical = around(_Avals_expon / (1.0 + 0.6/N),3)
    elif dist == 'logistic':
        def rootfunc(ab,xj,N):
            a,b = ab
            tmp = (xj-a)/b
            tmp2 = exp(tmp)
            val = [sum(1.0/(1+tmp2))-0.5*N,
                   sum(tmp*(1.0-tmp2)/(1+tmp2))+N]
            return array(val)
        sol0=array([xbar,stats.std(x)])
        sol = optimize.fsolve(rootfunc,sol0,args=(x,N),xtol=1e-5)
        w = (y-sol[0])/sol[1]
        z = distributions.logistic.cdf(w)
        sig = array([25,10,5,2.5,1,0.5])
        critical = around(_Avals_logistic / (1.0+0.25/N),3)
    else:
        def fixedsolve(th,xj,N):
            val = stats.sum(xj)*1.0/N
            tmp = exp(-xj/th)
            term = sum(xj*tmp)
            term /= sum(tmp)
            return val - term
        s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5)
        xbar = -s*log(sum(exp(-x/s))*1.0/N)
        w = (y-xbar)/s
        z = distributions.gumbel_l.cdf(w)
        sig = array([25,10,5,2.5,1])
        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)),3)
    i = arange(1,N+1)
    S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1])))
    A2 = -N-S
    return A2, critical, sig
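# Usage sketch (assumes x is a 1-d sample): compare A2 against the
# critical values at the returned significance levels.
#
#   >>> A2, crit, sig = anderson(x, dist='norm')
#   >>> reject = A2 > crit    # one flag per significance level in sig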
def ansari(x,y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.

    Specifically, compute the AB statistic and the two-sided p-value: the
    probability of wrongly rejecting the null hypothesis when it is true,
    using the computed statistic as the critical value.

    One can reject the null hypothesis that the ratio of scale parameters
    is 1 if the returned p-value is small (say < 0.05).
    """
    x,y = asarray(x),asarray(y)
    n = len(x)
    m = len(y)
    if (m < 1):
        raise ValueError, "Not enough other observations."
    if (n < 1):
        raise ValueError, "Not enough test observations."
    N = m+n
    xy = r_[x,y]  # combine
    rank = stats.rankdata(xy)
    symrank = amin(array((rank,N-rank+1)),0)
    AB = sum(symrank[:n])
    uxy = unique(xy)
    repeats = (len(uxy) != len(xy))
    exact = ((m<55) and (n<55) and not repeats)
    if repeats and ((m < 55)  or (n < 55)):
        print "Ties preclude use of exact statistic."
    if exact:
        astart, a1, ifault = statlib.gscale(n,m)
        ind = AB-astart
        total = sum(a1)
        if ind < len(a1)/2.0:
            cind = int(ceil(ind))
            if (ind == cind):
                pval = 2.0*sum(a1[:cind+1])/total
            else:
                pval = 2.0*sum(a1[:cind])/total
        else:
            find = int(floor(ind))
            if (ind == floor(ind)):
                pval = 2.0*sum(a1[find:])/total
            else:
                pval = 2.0*sum(a1[find+1:])/total
        return AB, min(1.0,pval)
    
    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n*(N+1.0)**2 / 4.0 / N
        varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2)
    else:
        mnAB = n*(N+2.0)/4.0
        varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0)
    if repeats:   # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank**2)
        if N % 2: # N odd
            varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1))
        else:  # N even
            varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1))
    z = (AB - mnAB)/sqrt(varAB)
    pval = (1-distributions.norm.cdf(abs(z)))*2.0
    return AB, pval
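# Usage sketch (assumes x1, x2 are 1-d samples with similar medians):
#
#   >>> AB, p = ansari(x1, x2)
#
# Exact tail probabilities are used for small samples without ties;
# otherwise the normal approximation above applies.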
def fligner(*args,**kwds):
    """Perform Levene test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  bartlett(x,y,z,...)

    One keyword input, center, can be used with values
        center = 'mean', center='median' (default), center='trimmed'

    Outputs: (Xsq, pval)

         Xsq  -- the Test statistic
         pval -- significance level if null is rejected with this value of X
                 (prob. that null is true but rejected with this p-value.)  

    References:

       http://www.stat.psu.edu/~bgl/center/tr/TR993.ps

       Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
       tests for scale. 'Journal of the American Statistical Association.'
       71(353), 210-213.
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    if 'center' in kwds:
        center = kwds['center']
    else:
        center = 'median'
    if center not in ['mean','median','trimmed']:
        raise ValueError, "Keyword argument <center> must be 'mean', "\
              + "'median', or 'trimmed'."
    if center == 'median':
        func = stats.median
    elif center == 'mean':
        func = stats.mean
    else:
        func = stats.trim_mean

    Ni = asarray([len(args[j]) for j in range(k)])
    Yci = asarray([func(args[j]) for j in range(k)])
    Ntot = sum(Ni)
    # compute Zij's
    Zij = [abs(asarray(args[i])-Yci[i]) for i in range(k)]
    allZij = []
    g = [0]
    for i in range(k):
        allZij.extend(list(Zij[i]))
        g.append(len(allZij))

    a = distributions.norm.ppf(stats.rankdata(allZij)/(2*(Ntot+1.0)) + 0.5)

    # compute Aibar
    Aibar = _apply_func(a,g,sum) / Ni
    anbar = stats.mean(a)
    varsq = stats.var(a)

    Xsq = sum(Ni*(asarray(Aibar)-anbar)**2.0)/varsq

    pval = distributions.chi2.sf(Xsq,k-1) # 1 - cdf
    return Xsq, pval
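# Usage sketch (assumes a, b, c are 1-d sample vectors; center defaults
# to 'median'):
#
#   >>> Xsq, p = fligner(a, b, c)
#   >>> Xsq, p = fligner(a, b, c, center='trimmed')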
def levene(*args,**kwds):
    """Perform Levene test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  levene(x,y,z,...)

    One keyword input, center, can be used with values
        center = 'mean', center='median' (default), center='trimmed'

    center='median' is recommended for skewed (non-normal) distributions
    center='mean' is recommended for symmetric, moderate-tailed, dist.
    center='trimmed' is recommended for heavy-tailed distributions.

    Outputs: (W, pval)

         W    -- the test statistic
         pval -- the p-value of the test: the probability of observing a
                 statistic at least as extreme as W if the null is true.

    References:

       http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm

       Levene, H. (1960). In Contributions to Probability and Statistics:
         Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
         Stanford University Press, pp. 278-292. 
       Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
         Statistical Association, 69, 364-367         
    """
    k = len(args)
    if k < 2:
        raise ValueError, "Must enter at least two input sample vectors."
    Ni = zeros(k)
    Yci = zeros(k,'d')
    if 'center' in kwds:
        center = kwds['center']
    else:
        center = 'median'
    if center not in ['mean','median','trimmed']:
        raise ValueError, "Keyword argument <center> must be 'mean', "\
              + "'median', or 'trimmed'."
    if center == 'median':
        func = stats.median
    elif center == 'mean':
        func = stats.mean
    else:
        func = stats.trim_mean
    for j in range(k):
        Ni[j] = len(args[j])
        Yci[j] = func(args[j])    
    Ntot = sum(Ni)

    # compute Zij's
    Zij = [None]*k
    for i in range(k):
        Zij[i] = abs(asarray(args[i])-Yci[i])
    # compute Zbari
    Zbari = zeros(k,'d')
    Zbar = 0.0
    for i in range(k):
        Zbari[i] = stats.mean(Zij[i])
        Zbar += Zbari[i]*Ni[i]
    Zbar /= Ntot

    numer = (Ntot-k)*sum(Ni*(Zbari-Zbar)**2)

    # compute denom_variance
    dvar = 0.0
    for i in range(k):
        dvar += sum((Zij[i]-Zbari[i])**2)

    denom = (k-1.0)*dvar

    W = numer / denom
    pval = distributions.f.sf(W,k-1,Ntot-k) # 1 - cdf
    return W, pval
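# Usage sketch (assumes a, b, c are 1-d sample vectors; choose center to
# match the tail behavior described in the docstring):
#
#   >>> W, p = levene(a, b, c)                  # median-centered
#   >>> W, p = levene(a, b, c, center='mean')   # for symmetric data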