Beispiel #1
0
def linregress(x:List(float),y:List(float))->(float,float,float,float,float):
    """
    Fits a least-squares regression line to paired x,y observations.

    Usage:   linregress(x,y)      x,y are equal-length lists of x-y coordinates
    Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
    Raises:  ValueError if x and y do not have the same length.
    """
    TINY = 1.0e-20  # keeps the t denominator non-zero when r is exactly +/-1
    if len(x) != len(y):
        raise ValueError('Input values not paired in linregress.  Aborting.')
    n = len(x)
    x = list(map(float,x))
    y = list(map(float,y))
    xmean = central_tendency.mean(x)
    ymean = central_tendency.mean(y)
    # These two terms feed both the correlation denominator and the slope,
    # so each is computed exactly once.
    x_term = n*support.ss(x) - support.square_of_sums(x)
    y_term = n*support.ss(y) - support.square_of_sums(y)
    r_num = float(n*(support.summult(x,y)) - sum(x)*sum(y))
    r = r_num / sqrt(x_term*y_term)
    # NOTE: an unused intermediate `z` was computed here before; it never
    # influenced the result and could raise a math domain error on its own.
    df = n-2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df,0.5,df/(df+t*t))
    slope = r_num / float(x_term)
    intercept = ymean - slope*xmean
    sterrest = sqrt(1-r*r)*variability.samplestdev(y)
    return slope, intercept, r, prob, sterrest
Beispiel #2
0
def kendalltau(x:List(float),y:List(float))->(float,float):
    """
    Calculates Kendall's tau, a correlation measure for ordinal data, by
    examining every distinct pair of observations.  Adapted from function
    kendl1 in Numerical Recipes.

    Usage:   kendalltau(x,y)      where x and y are equal-length lists
    Returns: Kendall's tau, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
             ZeroDivisionError if every pair is tied in x or in y.
    """
    if len(x) != len(y):
        raise ValueError('Input values not paired in kendalltau.  Aborting.')
    n = len(x)
    n1 = 0   # pairs whose x values differ
    n2 = 0   # pairs whose y values differ
    iss = 0  # concordant pairs minus discordant pairs
    for j in range(n-1):
        # Start at j+1: comparing an observation with itself would be
        # counted as a tie and distort the denominator.
        for k in range(j+1,n):
            a1 = x[j] - x[k]
            a2 = y[j] - y[k]
            aa = a1 * a2
            if aa:               # neither list has a tie
                n1 = n1 + 1
                n2 = n2 + 1
                if aa > 0:
                    iss = iss + 1
                else:
                    iss = iss - 1
            else:
                # A pair tied in both x and y contributes to neither count.
                if a1:
                    n1 = n1 + 1
                if a2:
                    n2 = n2 + 1
    tau = iss / sqrt(n1*n2)
    svar = (4.0*n+10.0) / (9.0*n*(n-1))
    _z = tau / sqrt(svar)
    prob = probability.erfcc(abs(_z)/1.4142136)
    return tau, prob
def mannwhitneyu(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates a Mann-Whitney U statistic on the provided scores and
    returns the result.  Use only when the n in each condition is < 20 and
    you have 2 independent samples of ranks.  NOTE: Mann-Whitney U is
    significant if the u-obtained is LESS THAN or equal to the critical
    value of U found in the tables.  Equivalent to Kruskal-Wallis H with
    just 2 groups.

    Usage:   mannwhitneyu(x,y)
    Returns: u-statistic (the smaller of the two U values),
             one-tailed p-value (i.e., p(z(U)))
    Raises:  ValueError if the tie correction factor is zero.
    """
    n1 = len(x)
    n2 = len(y)
    ranked = support.rankdata(x + y)
    rankx = ranked[0:n1]  # ranks belonging to the x sample
    u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - sum(rankx)  # calc U for x
    u2 = n1 * n2 - u1  # remainder is U for y
    bigu = max(u1, u2)
    smallu = min(u1, u2)
    # The tie correction scales the variance, so it goes under the single
    # square root below; it must not be square-rooted beforehand.
    # NOTE(review): assumes tiecorrect returns the variance multiplier
    # 1 - sum(t**3 - t)/(n**3 - n) -- confirm against its definition.
    T = tiecorrect(ranked)
    if T == 0:
        raise ValueError('All numbers are identical in lmannwhitneyu')
    sd = sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0)
    z = abs((bigu - n1 * n2 / 2.0) / sd)  # normal approximation for prob calc
    return smallu, 1.0 - probability.zprob(z)
Beispiel #4
0
def pointbiserialr(x:List(float),y:List(float))->(float,float):
    """
    Computes a point-biserial correlation between a two-valued variable x
    and a paired variable y, together with its probability value.  Taken
    from Heiman's Basic Statistics for the Behav. Sci (1st), p.194.

    Usage:   pointbiserialr(x,y)      where x,y are equal-length lists
    Returns: Point-biserial r, two-tailed p-value
    Raises:  ValueError if x and y differ in length, or if x does not
             contain exactly 2 distinct categories.
    """
    TINY = 1e-30  # keeps the t denominator non-zero when rpb is exactly +/-1
    if len(x) != len(y):
        raise ValueError('INPUT VALUES NOT PAIRED IN pointbiserialr.  ABORTING.')
    paired = pstat.abut(x,y)
    groups = pstat.unique(x)
    if len(groups) != 2:
        raise ValueError("Exactly 2 categories required for pointbiserialr().")

    # The recoding result is not used further; the calls are retained as-is.
    codemap = pstat.abut(groups,list(range(2)))
    recoded = pstat.recode(paired,codemap,0)

    # Split the paired rows by which category their first column holds.
    first_rows = pstat.linexand(paired,0,groups[0])
    second_rows = pstat.linexand(paired,0,groups[1])
    first_mean = central_tendency.mean(pstat.colex(first_rows,1))
    second_mean = central_tendency.mean(pstat.colex(second_rows,1))

    total = len(paired)
    weight = sqrt((len(first_rows)/float(total))*(len(second_rows)/float(total)))
    rpb = (second_mean - first_mean)/variability.samplestdev(pstat.colex(paired,1))*weight

    df = total-2
    t = rpb*sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
    prob = probability.betai(0.5*df,0.5,df/(df+t*t))
    return rpb, prob
Beispiel #5
0
def chisqprob(chisq:float,df:int)->float:
    """
    Returns the (1-tailed) probability value associated with the provided
    chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.

    Usage:   chisqprob(chisq,df)
    Returns: the probability; 1.0 when chisq <= 0 or df < 1.
    """
    BIG = 20.0
    if chisq <= 0 or df < 1:
        return 1.0

    half = 0.5 * chisq
    is_even = (df % 2 == 0)
    if df > 1:
        y = ex(-half,BIG)
    # y is only read for even df (which implies df >= 2) or when df > 2.
    s = y if is_even else 2.0 * zprob(-sqrt(chisq))
    if df <= 2:
        return s

    limit = 0.5 * (df - 1.0)          # upper bound of the series index
    term = 1.0 if is_even else 0.5    # series index, stepped by 1.0

    if half > BIG:
        # Large argument: accumulate the series terms through logarithms.
        e = 0.0 if is_even else log(sqrt(pi))
        c = log(half)
        while term <= limit:
            e = log(term) + e
            s = s + ex(c*term-half-e,BIG)
            term = term + 1.0
        return s

    e = 1.0 if is_even else 1.0 / sqrt(pi) / sqrt(half)
    c = 0.0
    while term <= limit:
        e = e * (half/float(term))
        c = c + e
        term = term + 1.0
    return (c*y+s)
def ttest_ind(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO INDEPENDENT samples of
    scores a, and b, using a pooled variance.  From Numerical Recipies,
    p.483.  The reporting options (printit, name1, name2, writemode) are
    fixed inside the function; with printit set to 0 no report is written.

    Usage:   ttest_ind(a,b)
    Returns: t-value, two-tailed prob
    """
    # Formerly optional arguments, now fixed.
    printit = 0
    name1, name2 = 'Samp1', 'Samp2'
    writemode = 'a'

    mean_a = central_tendency.mean(a)
    mean_b = central_tendency.mean(b)
    var_a = variability.stdev(a)**2
    var_b = variability.stdev(b)**2
    size_a = len(a)
    size_b = len(b)

    df = size_a + size_b - 2
    pooled = ((size_a - 1) * var_a + (size_b - 1) * var_b) / float(df)
    t = (mean_a - mean_b) / sqrt(pooled * (1.0 / size_a + 1.0 / size_b))
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))

    if printit != 0:
        statname = 'Independent samples T-test.'
        outputpairedstats(printit, writemode,
                          name1, size_a, mean_a, var_a, min(a), max(a),
                          name2, size_b, mean_b, var_b, min(b), max(b),
                          statname, t, prob)
    return t, prob
def ttest_1samp(a: List(float), popmean: int) -> (float, float):
    """
    Calculates the t-obtained for the T-test on ONE group of scores a,
    given a population mean.  The reporting options (printit, name,
    writemode) are fixed inside the function; with printit set to 0 no
    report is written.

    Usage:   ttest_1samp(a,popmean)
    Returns: t-value, two-tailed prob
    """
    # Formerly optional arguments, now fixed.
    printit = 0
    name = 'Sample'
    writemode = 'a'

    sample_mean = central_tendency.mean(a)
    sample_var = variability.var(a)
    n = len(a)
    df = n - 1
    svar = (df * sample_var) / float(df)
    t = (sample_mean - popmean) / sqrt(svar * (1.0 / n))
    prob = probability.betai(0.5 * df, 0.5, float(df) / (df + t * t))

    if printit != 0:
        statname = 'Single-sample T-test.'
        outputpairedstats(printit, writemode,
                          'Population', '--', popmean, 0, 0, 0,
                          name, n, sample_mean, sample_var, min(a), max(a),
                          statname, t, prob)
    return t, prob
def wilcoxont(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates the Wilcoxon T-test for related samples and returns the
    result.  A non-parametric T-test.  Pairs whose difference is zero are
    dropped before ranking.

    Usage:   wilcoxont(x,y)
    Returns: a t-statistic (the smaller signed-rank sum),
             two-tail probability estimate
    Raises:  ValueError if x and y differ in length.
    """
    if len(x) != len(y):
        raise ValueError('Unequal N in wilcoxont.  Aborting.')

    # Keep only the non-zero paired differences.
    diffs = []
    for xi, yi in zip(x, y):
        delta = xi - yi
        if delta != 0:
            diffs.append(delta)
    count = len(diffs)

    # Rank by magnitude, then total the ranks separately by sign.
    absranked = support.rankdata([abs(delta) for delta in diffs])
    r_plus = 0.0
    r_minus = 0.0
    for delta, rank in zip(diffs, absranked):
        if delta < 0:
            r_minus = r_minus + rank
        else:
            r_plus = r_plus + rank

    wt = min(r_plus, r_minus)
    mn = count * (count + 1) * 0.25
    se = sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0)
    _z = fabs(wt - mn) / se
    prob = 2 * (1.0 - probability.zprob(abs(_z)))
    return wt, prob
def stdev(inlist: List(float)) -> float:
    """
    Returns the standard deviation of the values in the passed list,
    taken as the square root of var(inlist) (per the original notes, the
    N-1 denominator form used to estimate the population stdev).

    Usage:   stdev(inlist)
    """
    variance = var(inlist)
    return sqrt(variance)
def sterr(inlist: List(float)) -> float:
    """
    Returns the standard error of the values in the passed list:
    stdev(inlist) divided by the square root of the number of values.

    Usage:   sterr(inlist)
    """
    deviation = stdev(inlist)
    root_n = float(sqrt(len(inlist)))
    return deviation / root_n
def samplestdev(inlist: List(float)) -> float:
    """
    Returns the standard deviation of the values in the passed list,
    taken as the square root of samplevar(inlist) (per the original notes,
    the N denominator form that DESCRIBES the sample only).

    Usage:   samplestdev(inlist)
    """
    variance = samplevar(inlist)
    return sqrt(variance)
def sem(inlist: List(float)) -> float:
    """
    Returns the estimated standard error of the mean (sx-bar) of the
    values in the passed list.  sem = stdev / sqrt(n)

    Usage:   sem(inlist)
    """
    return stdev(inlist) / sqrt(len(inlist))
Beispiel #13
0
def pearsonr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a Pearson correlation coefficient and the associated
    probability value.  Taken from Heiman's Basic Statistics for the Behav.
    Sci (2nd), p.195.

    Usage:   pearsonr(x,y)      where x and y are equal-length lists
    Returns: Pearson's r value, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
    """
    TINY = 1.0e-30  # keeps the t denominator non-zero when r is exactly +/-1
    if len(x) != len(y):
        raise ValueError('Input values not paired in pearsonr.  Aborting.',x,y)
    n = len(x)
    x = list(map(float,x))
    y = list(map(float,y))
    # NOTE: the means of x and y were computed here before but never used,
    # so those two extra passes over the data have been dropped.
    r_num = n*(support.summult(x,y)) - sum(x)*sum(y)
    r_den = sqrt((n*support.ss(x) - support.square_of_sums(x))*(n*support.ss(y)-support.square_of_sums(y)))
    r = (r_num / r_den)  # denominator already a float
    df = n-2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df,0.5,df/float(df+t*t))
    return r, prob
def ranksums(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates the rank sums statistic on the provided scores and
    returns the result.  Use only when the n in each condition is > 20 and you
    have 2 independent samples of ranks.

    Usage:   ranksums(x,y)
    Returns: a z-statistic, two-tailed p-value
    """
    n1 = len(x)
    n2 = len(y)
    # Rank the pooled scores; the first n1 ranks belong to x.
    pooled_ranks = support.rankdata(x + y)
    x_rank_sum = sum(pooled_ranks[:n1])
    span = n1 + n2 + 1
    expected = n1 * span / 2.0
    _z = (x_rank_sum - expected) / sqrt(n1 * n2 * span / 12.0)
    prob = 2 * (1.0 - probability.zprob(abs(_z)))
    return _z, prob
Beispiel #15
0
def spearmanr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a Spearman rank-order correlation coefficient.  Taken
    from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.

    Usage:   spearmanr(x,y)      where x and y are equal-length lists
    Returns: Spearman's r, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
    """
    TINY = 1e-30
    if len(x) != len(y):
        raise ValueError('Input values not paired in spearmanr.  Aborting.')
    n = len(x)
    rankx = rankdata(x)
    ranky = rankdata(y)
    dsq = sumdiffsquared(rankx,ranky)
    rs = 1 - 6*dsq / float(n*(n**2-1))
    # TINY keeps the denominator non-zero when rs is exactly +/-1 (a
    # perfect monotonic relationship); it is far too small to change the
    # result for any other rs.
    t = rs * sqrt((n-2) / ((rs+1.0+TINY)*(1.0-rs+TINY)))
    df = n-2
    probrs = probability.betai(0.5*df,0.5,df/(df+t*t))  # t already a float
    # probability values for rs are from part 2 of the spearman function in
    # Numerical Recipies, p.510.  They are close to tables, but not exact. (?)
    return rs, probrs
def ks_2samp(data1: List(float), data2: List(float)) -> (float, float):
    """
    Computes the Kolmogorov-Smirnof statistic on 2 samples.  From
    Numerical Recipies in C, page 493.

    The inputs are not modified; sorted copies are used internally.

    Usage:   ks_2samp(data1,data2)   data1&2 are lists of values for 2 conditions
    Returns: KS D-value (signed: the largest-magnitude difference
             cdf2 - cdf1 between the empirical distributions),
             associated p-value (1.0 if it cannot be computed)
    """
    sorted1 = sorted(data1)  # sorted copies: never reorder the caller's lists
    sorted2 = sorted(data2)
    n1 = len(sorted1)
    n2 = len(sorted2)
    j1 = 0
    j2 = 0
    fn1 = 0.0
    fn2 = 0.0
    d = 0.0
    while j1 < n1 and j2 < n2:
        d1 = sorted1[j1]
        d2 = sorted2[j2]
        if d1 <= d2:
            # Step past every copy of d1 so the empirical CDF is evaluated
            # after the whole run of ties, then count the values consumed.
            while j1 < n1 and sorted1[j1] == d1:
                j1 = j1 + 1
            fn1 = j1 / float(n1)
        if d2 <= d1:
            while j2 < n2 and sorted2[j2] == d2:
                j2 = j2 + 1
            fn2 = j2 / float(n2)
        dt = (fn2 - fn1)
        if fabs(dt) > fabs(d):
            d = dt
    try:
        en = sqrt(n1 * n2 / float(n1 + n2))
        prob = ksprob((en + 0.12 + 0.11 / en) * abs(d))
    except Exception:
        # Best-effort fallback (e.g. an empty sample makes en zero); unlike a
        # bare except this lets KeyboardInterrupt/SystemExit propagate.
        prob = 1.0
    return d, prob
def ttest_rel(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO RELATED samples of scores,
    a and b.  From Numerical Recipies, p.483.  The reporting options
    (printit, name1, name2, writemode) are fixed inside the function; with
    printit set to 0 no report is written.

    Usage:   ttest_rel(a,b)
    Returns: t-value, two-tailed prob
    Raises:  ValueError if a and b differ in length.
    """
    # Formerly optional arguments, now fixed.
    printit = 0
    name1, name2 = 'Sample1', 'Sample2'
    writemode = 'a'

    if len(a) != len(b):
        raise ValueError('Unequal length lists in ttest_rel.')

    mean_a = central_tendency.mean(a)
    mean_b = central_tendency.mean(b)
    var_a = variability.var(a)
    var_b = variability.var(b)
    n = len(a)

    # Sum of cross-products of deviations, accumulated pair by pair.
    cross = 0
    for ai, bi in zip(a, b):
        cross = cross + (ai - mean_a) * (bi - mean_b)

    df = n - 1
    _cov = cross / float(df)
    sd = sqrt((var_a + var_b - 2.0 * _cov) / float(n))
    t = (mean_a - mean_b) / sd
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))

    if printit != 0:
        statname = 'Related samples T-test.'
        outputpairedstats(printit, writemode,
                          name1, n, mean_a, var_a, min(a), max(a),
                          name2, n, mean_b, var_b, min(b), max(b),
                          statname, t, prob)
    return t, prob