def linregress(x:List(float),y:List(float))->(float,float,float,float,float):
    """
    Calculates a regression line on x,y pairs.

    Usage:   linregress(x,y)      x,y are equal-length lists of x-y coordinates
    Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
    Raises:  ValueError if x and y do not have the same length.
    """
    # Keeps the denominator of the t statistic non-zero when r is exactly +/-1.
    TINY = 1.0e-20
    if len(x) != len(y):
        raise ValueError('Input values not paired in linregress. Aborting.')
    n = len(x)
    x = list(map(float,x))
    y = list(map(float,y))
    xmean = central_tendency.mean(x)
    ymean = central_tendency.mean(y)
    r_num = float(n*(support.summult(x,y)) - sum(x)*sum(y))
    # The x term is shared by the correlation denominator and the slope.
    x_term = n*support.ss(x) - support.square_of_sums(x)
    y_term = n*support.ss(y) - support.square_of_sums(y)
    r = r_num / sqrt(x_term*y_term)
    # Rounding can push |r| a hair past 1 for perfectly collinear data, which
    # would make the square roots below raise a math domain error.
    r = max(-1.0, min(1.0, r))
    df = n-2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df,0.5,df/(df+t*t))
    slope = r_num / float(x_term)
    intercept = ymean - slope*xmean
    sterrest = sqrt(1-r*r)*variability.samplestdev(y)
    return slope, intercept, r, prob, sterrest
def kendalltau(x:List(float),y:List(float))->(float,float):
    """
    Calculates Kendall's tau ... correlation of ordinal data.  Adapted
    from function kendl1 in Numerical Recipes.

    Usage:   kendalltau(x,y)      x,y are equal-length lists
    Returns: Kendall's tau, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
    """
    if len(x) != len(y):
        raise ValueError('Input values not paired in kendalltau. Aborting.')
    n = len(x)
    n1 = 0   # pairs whose x values differ
    n2 = 0   # pairs whose y values differ
    iss = 0  # concordant pairs minus discordant pairs
    for j in range(n-1):
        # Start at j+1: pairing an element with itself is not a data pair and
        # would be miscounted as a tie.
        for k in range(j+1,n):
            a1 = x[j] - x[k]
            a2 = y[j] - y[k]
            aa = a1 * a2
            if aa:  # neither list has a tie
                n1 = n1 + 1
                n2 = n2 + 1
                if aa > 0:
                    iss = iss + 1
                else:
                    iss = iss - 1
            else:
                # A pair tied in one list still counts for the other list;
                # a pair tied in both counts for neither.
                if a1:
                    n1 = n1 + 1
                if a2:
                    n2 = n2 + 1
    tau = iss / sqrt(n1*n2)
    svar = (4.0*n+10.0) / (9.0*n*(n-1))
    _z = tau / sqrt(svar)
    prob = probability.erfcc(abs(_z)/1.4142136)
    return tau, prob
def mannwhitneyu(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates a Mann-Whitney U statistic on the provided scores and
    returns the result.  Use only when the n in each condition is < 20 and
    you have 2 independent samples of ranks.  NOTE: Mann-Whitney U is
    significant if the u-obtained is LESS THAN or equal to the critical
    value of U found in the tables.  Equivalent to Kruskal-Wallis H with
    just 2 groups.

    Usage:   mannwhitneyu(x,y)      x,y are the two samples
    Returns: u-statistic (the smaller of the two U values),
             one-tailed p-value (i.e., p(z(U)))
    Raises:  ValueError if the tie-correction factor is zero.
    """
    n1 = len(x)
    n2 = len(y)
    ranked = support.rankdata(x + y)
    rankx = ranked[0:n1]  # the first n1 ranks belong to x
    u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - sum(rankx)  # calc U for x
    u2 = n1 * n2 - u1  # remainder is U for y
    bigu = max(u1, u2)
    smallu = min(u1, u2)
    # The correction factor scales the variance, so it sits under the single
    # square root below; taking its square root here as well would apply the
    # root twice.
    # NOTE(review): assumes tiecorrect returns the multiplicative variance
    # correction factor for ties -- confirm against its definition.
    T = tiecorrect(ranked)
    if T == 0:
        raise ValueError('All numbers are identical in lmannwhitneyu')
    sd = sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0)
    z = abs((bigu - n1 * n2 / 2.0) / sd)  # normal approximation for prob calc
    return smallu, 1.0 - probability.zprob(z)
def pointbiserialr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a point-biserial correlation coefficient and the associated
    probability value.  Taken from Heiman's Basic Statistics for the Behav.
    Sci (1st), p.194.

    Usage:   pointbiserialr(x,y)      where x,y are equal-length lists
    Returns: Point-biserial r, two-tailed p-value
    Raises:  ValueError if the lists differ in length or if x does not hold
             exactly two distinct categories.
    """
    TINY = 1e-30
    if len(x) != len(y):
        raise ValueError('INPUT VALUES NOT PAIRED IN pointbiserialr. ABORTING.')
    data = pstat.abut(x,y)
    categories = pstat.unique(x)
    if len(categories) != 2:
        raise ValueError("Exactly 2 categories required for pointbiserialr().")

    # NOTE(review): neither of the next two results is read again below; the
    # calls are kept so the sequence of pstat calls is unchanged.
    _codemap = pstat.abut(categories,list(range(2)))
    _recoded = pstat.recode(data,_codemap,0)

    # Split the paired rows by the category found in column 0.
    first_rows = pstat.linexand(data,0,categories[0])
    second_rows = pstat.linexand(data,0,categories[1])
    first_mean = central_tendency.mean(pstat.colex(first_rows,1))
    second_mean = central_tendency.mean(pstat.colex(second_rows,1))

    total = len(data)
    adjust = sqrt((len(first_rows)/float(total))*(len(second_rows)/float(total)))
    spread = variability.samplestdev(pstat.colex(data,1))
    rpb = (second_mean - first_mean)/spread*adjust

    df = total-2
    t = rpb*sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
    prob = probability.betai(0.5*df,0.5,df/(df+t*t))
    return rpb, prob
def chisqprob(chisq:float,df:int)->float:
    """
    Returns the (1-tailed) probability value associated with the provided
    chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.

    Usage:   chisqprob(chisq,df)
    Returns: the probability; 1.0 when chisq <= 0 or df < 1
    """
    BIG = 20.0
    if chisq <= 0 or df < 1:
        return 1.0

    half = 0.5 * chisq
    is_even = (df % 2 == 0)

    # y is only defined for df > 1; the odd df == 1 case never reads it.
    if df > 1:
        y = ex(-half,BIG)
    s = y if is_even else 2.0 * zprob(-sqrt(chisq))

    if not (df > 2):
        return s

    # Series terms run over _z = start, start+1, ... up to (df - 1)/2.
    limit = 0.5 * (df - 1.0)
    _z = 1.0 if is_even else 0.5

    if half > BIG:
        # Large-argument branch: accumulate in log space.
        e = 0.0 if is_even else log(sqrt(pi))
        c = log(half)
        while _z <= limit:
            e = log(_z) + e
            s = s + ex(c*_z-half-e,BIG)
            _z = _z + 1.0
        return s

    e = 1.0 if is_even else 1.0 / sqrt(pi) / sqrt(half)
    c = 0.0
    while _z <= limit:
        e = e * (half/float(_z))
        c = c + e
        _z = _z + 1.0
    return (c*y+s)
def ttest_ind(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO INDEPENDENT samples of
    scores a, and b.  From Numerical Recipes, p.483.

    The print/report options described for the original routine (printit,
    name1, name2, writemode) are not parameters of this function, so no
    report is ever written; only the statistics are returned.

    Usage:   ttest_ind(a,b)
    Returns: t-value, two-tailed prob
    """
    x1 = central_tendency.mean(a)
    x2 = central_tendency.mean(b)
    v1 = variability.stdev(a)**2
    v2 = variability.stdev(b)**2
    n1 = len(a)
    n2 = len(b)
    df = n1 + n2 - 2
    # Pooled variance: each sample's variance weighted by its own n - 1.
    svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df)
    t = (x1 - x2) / sqrt(svar * (1.0 / n1 + 1.0 / n2))
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))
    return t, prob
def ttest_1samp(a: List(float), popmean: int) -> (float, float):
    """
    Calculates the t-obtained for the independent samples T-test on ONE
    group of scores a, given a population mean.

    The print/report options described for the original routine (printit,
    name, writemode) are not parameters of this function, so no report is
    ever written; only the statistics are returned.

    Usage:   ttest_1samp(a,popmean)
    Returns: t-value, two-tailed prob
    """
    x = central_tendency.mean(a)
    v = variability.var(a)
    n = len(a)
    df = n - 1
    svar = ((n - 1) * v) / float(df)
    t = (x - popmean) / sqrt(svar * (1.0 / n))
    prob = probability.betai(0.5 * df, 0.5, float(df) / (df + t * t))
    return t, prob
def wilcoxont(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates the Wilcoxon T-test for related samples and returns the
    result.  A non-parametric T-test.

    Usage:   wilcoxont(x,y)      x,y are equal-length lists
    Returns: a t-statistic, two-tail probability estimate
    Raises:  ValueError if x and y differ in length.
    """
    if len(x) != len(y):
        raise ValueError('Unequal N in wilcoxont. Aborting.')

    # Keep only the non-zero paired differences.
    all_diffs = (xi - yi for xi, yi in zip(x, y))
    d = [diff for diff in all_diffs if diff != 0]
    count = len(d)

    magnitudes = [abs(diff) for diff in d]
    ranks = support.rankdata(magnitudes)

    # Accumulate the ranks separately for negative and positive differences.
    r_plus = 0.0
    r_minus = 0.0
    for diff, rank in zip(d, ranks):
        if diff < 0:
            r_minus = r_minus + rank
        else:
            r_plus = r_plus + rank

    wt = min(r_plus, r_minus)
    mn = count * (count + 1) * 0.25
    se = sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0)
    _z = fabs(wt - mn) / se
    prob = 2 * (1.0 - probability.zprob(_z))
    return wt, prob
def stdev(inlist: List(float)) -> float:
    """
    Returns the standard deviation of the values in the passed list, taken
    as the square root of var(inlist).  Documented as using N-1 in the
    denominator (i.e., to estimate population stdev); that choice is made
    inside var.

    Usage:   stdev(inlist)
    """
    variance = var(inlist)
    return sqrt(variance)
def sterr(inlist: List(float)) -> float:
    """
    Returns the standard error of the values in the passed list:
    stdev(inlist) divided by the square root of the number of values.

    Usage:   sterr(inlist)
    """
    spread = stdev(inlist)
    root_n = float(sqrt(len(inlist)))
    return spread / root_n
def samplestdev(inlist: List(float)) -> float:
    """
    Returns the standard deviation of the values in the passed list, taken
    as the square root of samplevar(inlist).  Documented as using N for the
    denominator (i.e., DESCRIBES the sample stdev only); that choice is made
    inside samplevar.

    Usage:   samplestdev(inlist)
    """
    variance = samplevar(inlist)
    return sqrt(variance)
def sem(inlist: List(float)) -> float:
    """
    Returns the estimated standard error of the mean (sx-bar) of the
    values in the passed list.  sem = stdev / sqrt(n)

    Usage:   sem(inlist)
    """
    return stdev(inlist) / sqrt(len(inlist))
def pearsonr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a Pearson correlation coefficient and the associated
    probability value.  Taken from Heiman's Basic Statistics for the Behav.
    Sci (2nd), p.195.

    Usage:   pearsonr(x,y)      where x and y are equal-length lists
    Returns: Pearson's r value, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
    """
    # Keeps the denominator of the t statistic non-zero when r is exactly +/-1.
    TINY = 1.0e-30
    if len(x) != len(y):
        raise ValueError('Input values not paired in pearsonr. Aborting.',x,y)
    n = len(x)
    x = list(map(float,x))
    y = list(map(float,y))
    r_num = n*(support.summult(x,y)) - sum(x)*sum(y)
    r_den = sqrt((n*support.ss(x) - support.square_of_sums(x))*(n*support.ss(y)-support.square_of_sums(y)))
    r = (r_num / r_den)  # denominator already a float
    # Rounding can push |r| a hair past 1 for perfectly correlated data, which
    # would make the square root below raise a math domain error.
    r = max(-1.0, min(1.0, r))
    df = n-2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df,0.5,df/float(df+t*t))
    return r, prob
def ranksums(x: List(float), y: List(float)) -> (float, float):
    """
    Calculates the rank sums statistic on the provided scores and returns
    the result.  Use only when the n in each condition is > 20 and you
    have 2 independent samples of ranks.

    Usage:   ranksums(x,y)
    Returns: a z-statistic, two-tailed p-value
    """
    n1 = len(x)
    n2 = len(y)
    ranked = support.rankdata(x + y)
    # The first n1 ranks belong to x; only their sum is needed.
    rank_sum_x = sum(ranked[:n1])
    expected = n1 * (n1 + n2 + 1) / 2.0
    spread = sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
    _z = (rank_sum_x - expected) / spread
    prob = 2 * (1.0 - probability.zprob(abs(_z)))
    return _z, prob
def spearmanr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a Spearman rank-order correlation coefficient.  Taken
    from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.

    Usage:   spearmanr(x,y)      where x and y are equal-length lists
    Returns: Spearman's r, two-tailed p-value
    Raises:  ValueError if x and y do not have the same length.
    """
    # Keeps the denominator of the t statistic non-zero when rs is exactly
    # +/-1 (perfectly monotonic data).
    TINY = 1e-30
    if len(x) != len(y):
        raise ValueError('Input values not paired in spearmanr. Aborting.')
    n = len(x)
    rankx = rankdata(x)
    ranky = rankdata(y)
    dsq = sumdiffsquared(rankx,ranky)
    rs = 1 - 6*dsq / float(n*(n**2-1))
    t = rs * sqrt((n-2) / ((rs+1.0+TINY)*(1.0-rs+TINY)))
    df = n-2
    probrs = probability.betai(0.5*df,0.5,df/(df+t*t))  # t already a float
    # probability values for rs are from part 2 of the spearman function in
    # Numerical Recipes, p.510.  They are close to tables, but not exact. (?)
    return rs, probrs
def ks_2samp(data1: List(float), data2: List(float)) -> (float, float):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.  From
    Numerical Recipes in C, page 493.

    The inputs are not modified; sorted copies are used internally.

    Usage:   ks_2samp(data1,data2)   data1&2 are lists of values for 2 conditions
    Returns: KS D-value, associated p-value.  D is signed: it is the value of
             (fraction of data2 <= v) - (fraction of data1 <= v) at the point
             where that difference is largest in magnitude.  The p-value falls
             back to 1.0 if it cannot be computed (e.g. an empty sample).
    """
    n1 = len(data1)
    n2 = len(data2)
    sorted1 = sorted(data1)
    sorted2 = sorted(data2)
    j1 = 0
    j2 = 0
    fn1 = 0.0  # empirical CDF of data1 at the current point
    fn2 = 0.0  # empirical CDF of data2 at the current point
    d = 0.0
    while j1 < n1 and j2 < n2:
        d1 = sorted1[j1]
        d2 = sorted2[j2]
        if d1 <= d2:
            # Step past every copy of d1 so the CDF is evaluated after the
            # whole run of ties; j1 is then the count of values <= d1.
            j1 = j1 + 1
            while j1 < n1 and sorted1[j1] == d1:
                j1 = j1 + 1
            fn1 = j1 / float(n1)
        if d2 <= d1:
            j2 = j2 + 1
            while j2 < n2 and sorted2[j2] == d2:
                j2 = j2 + 1
            fn2 = j2 / float(n2)
        dt = fn2 - fn1
        if fabs(dt) > fabs(d):
            d = dt
    try:
        en = sqrt(n1 * n2 / float(n1 + n2))
        prob = ksprob((en + 0.12 + 0.11 / en) * abs(d))
    except Exception:
        # Deliberate best-effort: any failure in the p-value computation
        # (such as division by zero for an empty sample) reports p = 1.0.
        prob = 1.0
    return d, prob
def ttest_rel(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO RELATED samples of scores,
    a and b.  From Numerical Recipes, p.483.

    The print/report options described for the original routine (printit,
    name1, name2, writemode) are not parameters of this function, so no
    report is ever written; only the statistics are returned.

    Usage:   ttest_rel(a,b)      a,b are equal-length lists
    Returns: t-value, two-tailed prob
    Raises:  ValueError if a and b do not have the same length.
    """
    if len(a) != len(b):
        raise ValueError('Unequal length lists in ttest_rel.')
    x1 = central_tendency.mean(a)
    x2 = central_tendency.mean(b)
    v1 = variability.var(a)
    v2 = variability.var(b)
    n = len(a)
    # Sum of cross-products of the paired deviations from each mean.
    cov = sum((ai - x1) * (bi - x2) for ai, bi in zip(a, b))
    df = n - 1
    _cov = cov / float(df)
    sd = sqrt((v1 + v2 - 2.0 * _cov) / float(n))
    t = (x1 - x2) / sd
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))
    return t, prob