def bayes_mvs(data, alpha=0.90):
    """Return Bayesian confidence intervals for the mean, var, and std.

    Assumes the 1-d data all share the same mean and variance, and uses
    the Jeffreys prior for the variance and std.

    alpha gives the probability that the returned interval contains
    the true parameter.

    Uses the peak of the conditional pdf as the starting center.

    Returns (peak, (a, b)) for each of mean, variance, and standard deviation.
    """
    x = ravel(data)
    n = len(x)
    assert n > 1, "data must contain at least two values"
    n = float(n)
    xbar = sb.add.reduce(x)/n
    C = sb.add.reduce(x*x)/n - xbar*xbar
    #
    fac = sqrt(C/(n-1))
    tval = distributions.t.ppf((1+alpha)/2.0,n-1)
    delta = fac*tval
    ma = xbar - delta
    mb = xbar + delta
    mp = xbar
    #
    fac = n*C/2.0
    peak = 2/(n+1.)
    a = (n-1)/2.0    
    F_peak = distributions.invgamma.cdf(peak,a)
    q1 = F_peak - alpha/2.0
    q2 = F_peak + alpha/2.0
    if (q1 < 0):  # non-symmetric area
        q2 = alpha
        va = 0.0
    else:
        va = fac*distributions.invgamma.ppf(q1,a)
    vb = fac*distributions.invgamma.ppf(q2,a)
    vp = peak*fac
    #
    fac = sqrt(fac)
    peak = sqrt(2./n)
    F_peak = distributions.gengamma.cdf(peak,a,-2)
    q1 = F_peak - alpha/2.0
    q2 = F_peak + alpha/2.0
    if (q1 < 0):
        q2 = alpha
        sta = 0.0
    else:
        sta = fac*distributions.gengamma.ppf(q1,a,-2)
    stb = fac*distributions.gengamma.ppf(q2,a,-2)
    stp = peak*fac
        
    return (mp,(ma,mb)),(vp,(va,vb)),(stp,(sta,stb))
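
# Usage sketch (illustrative addition, not part of the original source;
# assumes numpy is importable).  Each returned triple is (center, (lo, hi)):
#
#     >>> import numpy as np
#     >>> data = np.random.normal(loc=5.0, scale=2.0, size=100)
#     >>> mres, vres, sres = bayes_mvs(data, alpha=0.95)
#     >>> mp, (ma, mb) = mres    # center and 95% interval for the mean
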
def pdf_moments(cnt):
    """Return the Gaussian expanded pdf function given the list of central
    moments (first one is mean).
    """
    N = len(cnt)
    if N < 2:
        raise ValueError, "At least two moments must be given to" + \
              "approximate the pdf."
    totp = poly1d(1)
    sig = sqrt(cnt[1])
    mu = cnt[0]
    if N > 2:
        Dvals = _hermnorm(N+1)
    for k in range(3,N+1):
        # Find Ck
        Ck = 0.0
        for n in range((k-3)//2):
            m = k-2*n
            if m % 2: # m is odd
                momdiff = cnt[m-1]
            else:
                momdiff = cnt[m-1] - sig*sig*scipy.factorial2(m-1)
            Ck += Dvals[k][m] / sig**m * momdiff
        # Add to totp 
        totp = totp +  Ck*Dvals[k]        
        
    def thisfunc(x):
        xn = (x-mu)/sig
        return totp(xn)*exp(-xn*xn/2.0)/sqrt(2*pi)/sig
    return thisfunc
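
# Usage sketch (illustrative addition): with only mean and variance supplied,
# the expansion reduces to a plain Gaussian, so the density at the mean is
# 1/sqrt(2*pi*var):
#
#     >>> f = pdf_moments([0.0, 4.0])    # mean 0, variance 4
#     >>> f(0.0)                         # ~ 1/sqrt(8*pi) = 0.1995
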
def wilcoxon(x,y=None):
    """
Calculates the Wilcoxon signed-rank test for the null hypothesis that two samples come from the same distribution. A non-parametric T-test. (need N > 20)

Returns: t-statistic, two-tailed p-value
"""
    if y is None:
        d = x
    else:
        x, y = map(asarray, (x, y))
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon.  Aborting.')
        d = x-y
    d = compress(not_equal(d,0),d) # Keep all non-zero differences
    count = len(d)
    if (count < 10):
        print "Warning: sample size too small for normal approximation."
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0)*r)
    r_minus = sum((d < 0)*r)
    T = min(r_plus, r_minus)
    mn = count*(count+1.0)*0.25
    se = math.sqrt(count*(count+1)*(2*count+1.0)/24)
    if (len(r) != len(unique(r))):  # handle ties in data
        replist, repnum = find_repeats(r)
        corr = 0.0
        for i in range(len(replist)):
            si = repnum[i]
            corr += 0.5*si*(si*si-1.0)
        V = se*se - corr
        se = sqrt((count*V - T*T)/(count-1.0))
    z = (T - mn)/se
    prob = 2*(1.0 - stats.zprob(abs(z)))
    return T, prob
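
# Usage sketch (illustrative addition, made-up paired data): zero differences
# are dropped, so at least ~10 non-zero pairs are needed for the normal
# approximation used above:
#
#     >>> import numpy as np
#     >>> before = np.array([125., 115, 130, 140, 141, 115, 140, 125, 141, 135])
#     >>> after  = np.array([110., 122, 125, 120, 140, 124, 123, 137, 135, 145])
#     >>> T, p = wilcoxon(before, after)    # T statistic, two-sided p-value
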
def mood(x,y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using a Mood test.

    Specifically, compute the z statistic and the probability of error
    that the null hypothesis is true but rejected with the computed
    statistic as the critical value.

    One can reject the null hypothesis that the ratio of scale parameters is
    1 if the returned probability of error is small (say < 0.05)
    """
    n = len(x)
    m = len(y)
    xy = r_[x,y]
    N = m+n
    if (N < 3):
        raise ValueError, "Not enough observations."
    ranks = stats.rankdata(xy)
    Ri = ranks[:n]
    M = sum((Ri - (N+1.0)/2)**2)
    # Approx stat.
    mnM = n*(N*N-1.0)/12
    varM = m*n*(N+1.0)*(N+2)*(N-2)/180
    z = (M-mnM)/sqrt(varM)
    p = distributions.norm.cdf(z)
    pval = 2*min(p,1-p)
    return z, pval
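
# Usage sketch (illustrative addition): samples with equal medians but
# different spreads should give a small p-value:
#
#     >>> import numpy as np
#     >>> x = np.random.normal(0.0, 1.0, 50)
#     >>> y = np.random.normal(0.0, 3.0, 60)
#     >>> z, p = mood(x, y)    # p small -> scales likely differ
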
def circstd(samples, high=2*pi, low=0):
    """Compute the circular standard deviation for samples assumed to be in the range [low to high]
    """
    ang = (samples - low)*2*pi / (high-low)
    res = stats.mean(exp(1j*ang))
    V = 1-abs(res)
    return ((high-low)/2.0/pi) * sqrt(V)
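
# Usage sketch (illustrative addition): angles tightly clustered near pi give
# a small circular standard deviation even though the data wrap at 0/2*pi:
#
#     >>> import numpy as np
#     >>> samples = np.random.vonmises(np.pi, 20.0, 100) % (2*np.pi)
#     >>> circstd(samples)    # small, since the samples are concentrated
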
def factorial2(n, exact=0):
    """n!! = special.gamma(n/2+1)*2**((n+1)/2)/sqrt(pi)   n odd
           = 2**(n/2) * (n/2)!                            n even

    If exact==0, then floating point precision is used; otherwise
    an exact integer is computed.

    Notes:
      - Array argument accepted only for the exact=0 case.
      - If n < -1, the return value is 0.
    """
    if exact:
        if n < -1:
            return 0
        if n <= 0:
            return 1
        n = int(n)
        val = 1
        k = n
        while (k > 0):
            val = val*k
            k -= 2
        return val
    else:
        n = asarray(n)
        vals = zeros(n.shape,'d')
        cond1 = (n % 2) & (n >= -1)
        cond2 = (1-(n % 2)) & (n >= -1)
        oddn = extract(cond1,n)
        evenn = extract(cond2,n)
        nd2o = oddn / 2.0
        nd2e = evenn / 2.0
        insert(vals,cond1,special.gamma(nd2o+1)/sqrt(pi)*pow(2.0,nd2o+0.5))
        insert(vals,cond2,special.gamma(nd2e+1) * pow(2.0,nd2e))
        return vals
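
# Usage sketch (illustrative addition): 7!! = 7*5*3*1 = 105 in both branches;
# array arguments only work in the floating-point (exact=0) branch:
#
#     >>> factorial2(7, exact=1)    # 105
#     >>> factorial2([6, 7])        # array([ 48., 105.])
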
def norm(x, ord=2):
    """ norm(x, ord=2) -> n

    Matrix and vector norm.

    Inputs:

      x -- a rank-1 (vector) or rank-2 (matrix) array
      ord -- the order of norm.

     Comments:

       For vectors ord can be any real number including Inf or -Inf.
         ord = Inf, computes the maximum of the magnitudes
         ord = -Inf, computes minimum of the magnitudes
         ord is finite, computes sum(abs(x)**ord)**(1.0/ord)

       For matrices ord can only be +/- 1, 2, Inf, or 'fro'.
         ord = 2 computes the largest singular value
         ord = -2 computes the smallest singular value
         ord = 1 computes the largest column sum of absolute values
         ord = -1 computes the smallest column sum of absolute values
         ord = Inf computes the largest row sum of absolute values
         ord = -Inf computes the smallest row sum of absolute values
         ord = 'fro' computes the Frobenius norm sqrt(sum(diag(X.H * X)))
    """
    x = asarray_chkfinite(x)
    nd = len(x.shape)
    Inf = scipy_base.Inf
    if nd == 1:
        if ord == Inf:
            return scipy_base.amax(abs(x))
        elif ord == -Inf:
            return scipy_base.amin(abs(x))
        else:
            return scipy_base.sum(abs(x)**ord)**(1.0/ord)
    elif nd == 2:
        if ord == 2:
            return scipy_base.amax(decomp.svd(x,compute_uv=0))
        elif ord == -2:
            return scipy_base.amin(decomp.svd(x,compute_uv=0))
        elif ord == 1:
            return scipy_base.amax(scipy_base.sum(abs(x),axis=0))  # max column sum
        elif ord == Inf:
            return scipy_base.amax(scipy_base.sum(abs(x),axis=1))
        elif ord == -1:
            return scipy_base.amin(scipy_base.sum(abs(x),axis=0))  # min column sum
        elif ord == -Inf:
            return scipy_base.amin(scipy_base.sum(abs(x),axis=1))
        elif ord in ['fro','f']:
            val = real((conjugate(x)*x).flat)
            return sqrt(add.reduce(val))
        else:
            raise ValueError, "Invalid norm order for matrices."
    else:
        raise ValueError, "Improper number of dimensions to norm."
def pdf_fromgamma(g1, g2, g3=0.0, g4=None):
    """Return an approximate pdf built from the parameters g1..g4 using an
    Edgeworth-type expansion in normalized Hermite polynomials about a
    Gaussian.  If g4 is not given, it defaults to its Gaussian-consistent
    value 3*g2**2.
    """
    if g4 is None:
        g4 = 3*g2*g2
    sigsq = 1.0/g2
    sig = sqrt(sigsq)
    mu = g1*sig**3.0
    p12 = _hermnorm(13)
    for k in range(13):
        p12[k] = p12[k]/sig**k

    # Add all of the terms to polynomial
    totp = p12[0] - (g1/6.0*p12[3]) + \
           (g2/24.0*p12[4] +g1*g1/72.0*p12[6]) - \
           (g3/120.0*p12[5] + g1*g2/144.0*p12[7] + g1**3.0/1296.0*p12[9]) + \
           (g4/720*p12[6] + (g2*g2/1152.0+g1*g3/720)*p12[8] +
            g1*g1*g2/1728.0*p12[10] + g1**4.0/31104.0*p12[12])
    # Final normalization
    totp = totp / sqrt(2*pi)/sig
    def thefunc(x):
        xn = (x-mu)/sig
        return totp(xn)*exp(-xn*xn/2.0)
    return thefunc
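
# Usage sketch (illustrative addition): with g1 = 0 and g4 left at its
# Gaussian-consistent default, all odd-order correction terms vanish:
#
#     >>> f = pdf_fromgamma(0.0, 1.0)    # no skew
#     >>> f(0.0)                         # density at the center
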
def ansari(x,y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.

    Specifically, compute the AB statistic and the probability of error
    that the null hypothesis is true but rejected with the computed
    statistic as the critical value.

    One can reject the null hypothesis that the ratio of variances is 1 if
    returned probability of error is small (say < 0.05)
    """
    x,y = asarray(x),asarray(y)
    n = len(x)
    m = len(y)
    if (m < 1):
        raise ValueError, "Not enough other observations."
    if (n < 1):
        raise ValueError, "Not enough test observations."
    N = m+n
    xy = r_[x,y]  # combine
    rank = stats.rankdata(xy)
    symrank = amin(array((rank,N-rank+1)),0)
    AB = sum(symrank[:n])
    uxy = unique(xy)
    repeats = (len(uxy) != len(xy))
    exact = ((m<55) and (n<55) and not repeats)
    if repeats and ((m < 55)  or (n < 55)):
        print "Ties preclude use of exact statistic."
    if exact:
        astart, a1, ifault = statlib.gscale(n,m)
        ind = AB-astart
        total = sum(a1)
        if ind < len(a1)/2.0:
            cind = int(ceil(ind))
            if (ind == cind):
                pval = 2.0*sum(a1[:cind+1])/total
            else:
                pval = 2.0*sum(a1[:cind])/total
        else:
            find = int(floor(ind))
            if (ind == floor(ind)):
                pval = 2.0*sum(a1[find:])/total
            else:
                pval = 2.0*sum(a1[find+1:])/total
        return AB, min(1.0,pval)
    
    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n*(N+1.0)**2 / 4.0 / N
        varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2)
    else:
        mnAB = n*(N+2.0)/4.0
        varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0)
    if repeats:   # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank**2)
        if N % 2: # N odd
            varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1))
        else:  # N even
            varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1))
    z = (AB - mnAB)/sqrt(varAB)
    pval = (1-distributions.norm.cdf(abs(z)))*2.0
    return AB, pval
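
# Usage sketch (illustrative addition): for small, tie-free samples the exact
# null distribution from statlib.gscale is used; otherwise the normal
# approximation (with a tie correction) applies:
#
#     >>> import numpy as np
#     >>> x = np.random.normal(0.0, 1.0, 30)
#     >>> y = np.random.normal(0.0, 2.0, 40)
#     >>> AB, p = ansari(x, y)    # p small -> scales likely differ
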
def anderson(x,dist='norm'):
    """Anderson and Darling test for normal, exponential, or Gumbel
    (Extreme Value Type I) distribution.

    Given samples x, return A2, the Anderson-Darling statistic,
    the significance levels in percentages, and the corresponding
    critical values.

    Critical values provided are for the following significance levels
    norm/expon:   15%, 10%, 5%, 2.5%, 1%
    Gumbel:       25%, 10%, 5%, 2.5%, 1%
    logistic:     25%, 10%, 5%, 2.5%, 1%, 0.5%

    If A2 is larger than these critical values then for that significance
    level, the hypothesis that the data come from a normal (exponential)
    can be rejected.
    """
    if dist not in ['norm','expon','gumbel','extreme1','logistic']:
        raise ValueError("Invalid distribution.")
    y = sort(x)
    xbar = stats.mean(x)
    N = len(y)
    if dist == 'norm':
        s = stats.std(x)
        w = (y-xbar)/s
        z = distributions.norm.cdf(w)
        sig = array([15,10,5,2.5,1])
        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N),3)
    elif dist == 'expon':
        w = y / xbar
        z = distributions.expon.cdf(w)
        sig = array([15,10,5,2.5,1])
        critical = around(_Avals_expon / (1.0 + 0.6/N),3)
    elif dist == 'logistic':
        def rootfunc(ab,xj,N):
            a,b = ab
            tmp = (xj-a)/b
            tmp2 = exp(tmp)
            val = [sum(1.0/(1+tmp2))-0.5*N,
                   sum(tmp*(1.0-tmp2)/(1+tmp2))+N]
            return array(val)
        sol0=array([xbar,stats.std(x)])
        sol = optimize.fsolve(rootfunc,sol0,args=(x,N),xtol=1e-5)
        w = (y-sol[0])/sol[1]
        z = distributions.logistic.cdf(w)
        sig = array([25,10,5,2.5,1,0.5])
        critical = around(_Avals_logistic / (1.0+0.25/N),3)
    else:
        def fixedsolve(th,xj,N):
            val = stats.sum(xj)*1.0/N
            tmp = exp(-xj/th)
            term = sum(xj*tmp)
            term /= sum(tmp)
            return val - term
        s = optimize.fixed_point(fixedsolve, 1.0, args=(x,N),xtol=1e-5)
        xbar = -s*log(sum(exp(-x/s))*1.0/N)
        w = (y-xbar)/s
        z = distributions.gumbel_l.cdf(w)
        sig = array([25,10,5,2.5,1])
        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)),3)
    i = arange(1,N+1)
    S = sum((2*i-1.0)/N*(log(z)+log(1-z[::-1])))
    A2 = -N-S
    return A2, critical, sig
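
# Usage sketch (illustrative addition): compare A2 against the returned
# critical values; exceeding the value for a given significance level rejects
# that distribution at that level:
#
#     >>> import numpy as np
#     >>> x = np.random.normal(10.0, 2.0, 200)
#     >>> A2, crit, sig = anderson(x, dist='norm')
#     >>> (A2 < crit).all()    # usually True here: normality not rejected
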