def pointbiserialr(x:List(float),y:List(float))->(float,float):
    """
    Computes a point-biserial correlation coefficient between a grouping
    list x (which must hold exactly two categories) and a list of scores y,
    along with the associated probability value.  Taken from Heiman's Basic
    Statistics for the Behav. Sci (1st), p.194.

    Usage:   pointbiserialr(x,y)      where x,y are equal-length lists
    Returns: Point-biserial r, two-tailed p-value
    Raises:  ValueError if len(x) != len(y), or if pstat.unique(x) does not
             yield exactly 2 categories
    """
    TINY = 1e-30  # keeps the denominator of t off zero when rpb is +/-1
    if len(x) != len(y):
        raise ValueError('INPUT VALUES NOT PAIRED IN pointbiserialr. ABORTING.')
    paired = pstat.abut(x, y)
    groups = pstat.unique(x)
    if len(groups) != 2:
        raise ValueError("Exactly 2 categories required for pointbiserialr().")

    # The recode step is retained from the original flow; its result is not
    # used by anything below.
    codemap = pstat.abut(groups, list(range(2)))
    pstat.recode(paired, codemap, 0)

    # Rows selected on column 0 for each category (presumably the rows whose
    # x value equals that category -- confirm against pstat.linexand).
    first_rows = pstat.linexand(paired, 0, groups[0])
    second_rows = pstat.linexand(paired, 0, groups[1])
    first_mean = central_tendency.mean(pstat.colex(first_rows, 1))
    second_mean = central_tendency.mean(pstat.colex(second_rows, 1))

    n = len(paired)
    first_share = len(first_rows) / float(n)
    second_share = len(second_rows) / float(n)
    adjust = sqrt(first_share * second_share)
    spread = variability.samplestdev(pstat.colex(paired, 1))
    rpb = (second_mean - first_mean) / spread * adjust

    df = n - 2
    t = rpb * sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY)))
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))
    return rpb, prob
def linregress(x:List(float),y:List(float))->(float,float,float,float,float):
    """
    Calculates a regression line on x,y pairs.

    Usage:   linregress(x,y)      x,y are equal-length lists of x-y coordinates
    Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
    Raises:  ValueError if len(x) != len(y)
    """
    TINY = 1.0e-20  # keeps the denominator of t off zero when r is +/-1
    if len(x) != len(y):
        raise ValueError('Input values not paired in linregress. Aborting.')
    n = len(x)
    x = list(map(float, x))
    y = list(map(float, y))
    xmean = central_tendency.mean(x)
    ymean = central_tendency.mean(y)

    r_num = float(n*(support.summult(x, y)) - sum(x)*sum(y))
    # Per-variable terms of the correlation denominator; the x term is also
    # the slope denominator, so it is computed once and reused.
    x_term = n*support.ss(x) - support.square_of_sums(x)
    y_term = n*support.ss(y) - support.square_of_sums(y)
    r_den = sqrt(x_term * y_term)
    r = r_num / r_den
    # A correlation cannot exceed 1 in magnitude; rounding on perfectly
    # collinear data can overshoot by a hair, which would make the square
    # roots below fail.  (NaN is left untouched by these comparisons.)
    if r > 1.0:
        r = 1.0
    elif r < -1.0:
        r = -1.0

    df = n - 2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df, 0.5, df/(df+t*t))
    slope = r_num / float(x_term)
    intercept = ymean - slope*xmean
    sterrest = sqrt(1-r*r)*variability.samplestdev(y)
    return slope, intercept, r, prob, sterrest
def obrientransform(args: List(List(float))) -> List(List(float)):
    """
    Computes a transform on input data (any number of columns).  Used to
    test for homogeneity of variance prior to running one-way stats.  From
    Maxwell and Delaney, p.112.

    Usage:   obrientransform(args)     args is a list of data columns
    Returns: transformed data for use in an ANOVA (deep copies of the input
             columns with every value replaced; the input is not modified)
    Raises:  ValueError if, for any column, the variance of the original
             values exceeds the mean of the transformed values by more
             than 1e-10
    """
    TINY = 1e-10
    columns = []
    sizes = []
    variances = []
    means = []
    for column in args:
        duplicate = copy.deepcopy(column)
        columns.append(duplicate)
        sizes.append(len(duplicate))
        variances.append(var([float(value) for value in duplicate]))
        means.append(central_tendency.mean([float(value) for value in duplicate]))

    for size, variance, center, column in zip(sizes, variances, means, columns):
        # These two terms do not depend on the individual value.
        correction = 0.5 * variance * (size - 1.0)
        scale = (size - 1.0) * (size - 2.0)
        for idx, value in enumerate(column):
            weighted = (size - 1.5) * size * (value - center)**2
            column[idx] = (weighted - correction) / scale

    # One-sided sanity check: only a variance larger than the transformed
    # mean counts as a problem.
    exceeded = [
        variance - central_tendency.mean(column) > TINY
        for variance, column in zip(variances, columns)
    ]
    if any(exceeded):
        raise ValueError('Problem in obrientransform.')
    return columns
def ttest_ind(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores
    a, and b, using a pooled variance estimate.  From Numerical Recipes,
    p.483.  The reporting options (printit, name1, name2, writemode) are
    fixed inside the function; with printit set to 0 the report branch is
    never taken.

    Usage:   ttest_ind(a,b)
    Returns: t-value, two-tailed prob
    """
    # Formerly optional arguments, now fixed.
    printit, name1, name2, writemode = 0, 'Samp1', 'Samp2', 'a'

    x1 = central_tendency.mean(a)
    x2 = central_tendency.mean(b)
    v1 = variability.stdev(a)**2
    v2 = variability.stdev(b)**2
    n1, n2 = len(a), len(b)
    df = n1 + n2 - 2

    pooled = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df)
    stderr = sqrt(pooled * (1.0 / n1 + 1.0 / n2))
    t = (x1 - x2) / stderr
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))

    if printit != 0:
        statname = 'Independent samples T-test.'
        outputpairedstats(printit, writemode,
                          name1, n1, x1, v1, min(a), max(a),
                          name2, n2, x2, v2, min(b), max(b),
                          statname, t, prob)
    return t, prob
def ttest_1samp(a: List(float), popmean: int) -> (float, float):
    """
    Calculates the t-obtained for a T-test on ONE group of scores a, given
    a population mean.  The reporting options (printit, name, writemode)
    are fixed inside the function; with printit set to 0 the report branch
    is never taken.

    Usage:   ttest_1samp(a,popmean)
    Returns: t-value, two-tailed prob
    """
    # Formerly optional arguments, now fixed.
    printit, name, writemode = 0, 'Sample', 'a'

    sample_mean = central_tendency.mean(a)
    sample_var = variability.var(a)
    n = len(a)
    df = n - 1

    svar = ((n - 1) * sample_var) / float(df)
    stderr = sqrt(svar * (1.0 / n))
    t = (sample_mean - popmean) / stderr
    prob = probability.betai(0.5 * df, 0.5, float(df) / (df + t * t))

    if printit != 0:
        statname = 'Single-sample T-test.'
        outputpairedstats(printit, writemode,
                          'Population', '--', popmean, 0, 0, 0,
                          name, n, sample_mean, sample_var, min(a), max(a),
                          statname, t, prob)
    return t, prob
def variation(inlist: List(float)) -> float:
    """
    Returns the coefficient of variation, as defined in CRC Standard
    Probability and Statistics, p.6: 100 times the sample standard
    deviation divided by the mean.

    Usage:   variation(inlist)
    """
    spread = variability.samplestdev(inlist)
    center = float(central_tendency.mean(inlist))
    return 100.0 * spread / center
def z(inlist: List(float), score: float) -> float:
    """
    Returns the z-score for a given input score, given that score and the
    list from which that score came: the score's distance from the list
    mean divided by samplestdev(inlist).  Not appropriate for population
    calculations.

    Usage:   z(inlist, score)
    """
    offset = score - central_tendency.mean(inlist)
    return offset / samplestdev(inlist)
def samplevar(inlist: List(float)) -> float:
    """
    Returns the variance of the values in the passed list using N for the
    denominator (i.e., DESCRIBES the sample variance only).

    Usage:   samplevar(inlist)
    """
    center = central_tendency.mean(inlist)
    deviations = [value - center for value in inlist]
    return support.ss(deviations) / float(len(inlist))
def var(inlist: List(float)) -> float:
    """
    Returns the variance of the values in the passed list using N-1 for
    the denominator (i.e., for estimating population variance).

    Usage:   var(inlist)
    """
    center = central_tendency.mean(inlist)
    deviations = [value - center for value in inlist]
    return support.ss(deviations) / float(len(inlist) - 1)
def cov(x: List(float), y: List(float)) -> float:
    """
    Returns the estimated covariance of two equal-length lists, using N-1
    for the denominator.

    Usage:   cov(x,y)
    Returns: SUM((x[i]-mean(x)) * (y[i]-mean(y))) / (N-1)
    Raises:  ValueError if len(x) != len(y); ZeroDivisionError for a single
             pair, because the denominator N-1 is then zero
    """
    # Covariance is only defined on paired observations; without this check
    # a longer y was silently truncated while still contributing to its mean.
    if len(x) != len(y):
        raise ValueError('Input values not paired in cov. Aborting.')
    n = len(x)
    xmn = central_tendency.mean(x)
    ymn = central_tendency.mean(y)
    ss = 0.0
    for xval, yval in zip(x, y):
        ss += (xval - xmn) * (yval - ymn)
    return ss / float(n - 1)
def describe(inlist: List(float)) -> (int, (float, float), float, float, float, float):
    """
    Returns some descriptive statistics of the passed list (assumed to be
    1D).

    Usage:   describe(inlist)
    Returns: n, (min, max), mean, standard deviation, skew, kurtosis
    """
    return (
        len(inlist),
        (min(inlist), max(inlist)),
        central_tendency.mean(inlist),
        variability.stdev(inlist),
        skew(inlist),
        kurtosis(inlist),
    )
def pearsonr(x:List(float),y:List(float))->(float,float):
    """
    Calculates a Pearson correlation coefficient and the associated
    probability value.  Taken from Heiman's Basic Statistics for the Behav.
    Sci (2nd), p.195.

    Usage:   pearsonr(x,y)      where x and y are equal-length lists
    Returns: Pearson's r value, two-tailed p-value
    Raises:  ValueError if len(x) != len(y)
    """
    TINY = 1.0e-30  # keeps the denominator of t off zero when r is +/-1
    if len(x) != len(y):
        raise ValueError('Input values not paired in pearsonr. Aborting.',x,y)
    n = len(x)
    x = list(map(float, x))
    y = list(map(float, y))

    r_num = n*(support.summult(x, y)) - sum(x)*sum(y)
    r_den = sqrt((n*support.ss(x) - support.square_of_sums(x))*(n*support.ss(y)-support.square_of_sums(y)))
    r = r_num / r_den  # denominator already a float
    # A correlation cannot exceed 1 in magnitude; rounding on perfectly
    # collinear data can overshoot by a hair, which would make the square
    # root below fail.  (NaN is left untouched by these comparisons.)
    if r > 1.0:
        r = 1.0
    elif r < -1.0:
        r = -1.0

    df = n - 2
    t = r*sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
    prob = probability.betai(0.5*df, 0.5, df/float(df+t*t))
    return r, prob
def ttest_rel(a: List(float), b: List(float)) -> (float, float):
    """
    Calculates the t-obtained T-test on TWO RELATED samples of scores, a
    and b.  From Numerical Recipes, p.483.  The reporting options (printit,
    name1, name2, writemode) are fixed inside the function; with printit
    set to 0 the report branch is never taken.

    Usage:   ttest_rel(a,b)
    Returns: t-value, two-tailed prob
    Raises:  ValueError if len(a) != len(b)
    """
    # Formerly optional arguments, now fixed.
    printit, name1, name2, writemode = 0, 'Sample1', 'Sample2', 'a'

    if len(a) != len(b):
        raise ValueError('Unequal length lists in ttest_rel.')
    x1 = central_tendency.mean(a)
    x2 = central_tendency.mean(b)
    v1 = variability.var(a)
    v2 = variability.var(b)
    n = len(a)

    # Sum of cross-products of deviations, accumulated pair by pair.
    cross = 0
    for left, right in zip(a, b):
        cross += (left - x1) * (right - x2)

    df = n - 1
    covariance = cross / float(df)
    sd = sqrt((v1 + v2 - 2.0 * covariance) / float(n))
    t = (x1 - x2) / sd
    prob = probability.betai(0.5 * df, 0.5, df / (df + t * t))

    if printit != 0:
        statname = 'Related samples T-test.'
        outputpairedstats(printit, writemode,
                          name1, n, x1, v1, min(a), max(a),
                          name2, n, x2, v2, min(b), max(b),
                          statname, t, prob)
    return t, prob
def moment(inlist: List(float), moment: int) -> float:
    """
    Calculates the nth moment about the mean for a sample.  Used to
    calculate coefficients of skewness and kurtosis.

    Usage:   moment(inlist,moment)
    Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r)
    """
    # The first moment about the mean is zero by definition, so it is
    # returned directly without touching the data.
    if moment == 1:
        return 0.0

    center = central_tendency.mean(inlist)
    total = 0
    for value in inlist:
        total += (value - center)**moment
    return total / float(len(inlist))
with t:
    # Two float lists covering 1 .. LIST_SIZE-1; lf has index 2 set to 3.0.
    l = [float(i) for i in range(1, LIST_SIZE)]
    lf = [float(i) for i in range(1, LIST_SIZE)]
    lf[2] = 3.0
    ll = [l] * 5

    # Each statistic is printed for the inputs in the order listed in its
    # tuple; all values are computed before the line is printed.
    print('\nCENTRAL TENDENCY')
    print('geometricmean:',
          *[central_tendency.geometricmean(s) for s in (l, lf, l, lf)])
    print('harmonicmean:',
          *[central_tendency.harmonicmean(s) for s in (l, lf, l, lf)])
    print('mean:',
          *[central_tendency.mean(s) for s in (l, lf, l, lf)])
    print('median:',
          *[central_tendency.median(s) for s in (l, lf, l, lf)])
    print('medianscore:',
          *[central_tendency.medianscore(s) for s in (l, lf, l, lf)])
    print('mode:',
          *[central_tendency.mode(s) for s in (l, l)])

    print('\nMOMENTS')
    print('moment:',
          *[moment.moment(s, 2) for s in (l, lf, l, lf)])
    print('variation:',
          *[moment.variation(s) for s in (l, l, lf, lf)])
    print('skew:',
          *[moment.skew(s) for s in (l, lf, l, lf)])
def testMeanResult(self):
    # central_tendency.mean must reproduce the known means of two fixtures.
    self.assertEqual(5.5, central_tendency.mean(sample1))
    # Two-sided tolerance: a one-sided `13.78 - mean < 0.01` check would
    # also pass for any mean far above 13.78.
    self.assertAlmostEqual(13.78, central_tendency.mean(sample7), delta=0.01)
def testMeanResult(self):
    # central_tendency.mean must reproduce the known means of two fixtures.
    self.assertEqual(5.5, central_tendency.mean(sample1))
    # Two-sided tolerance: a one-sided `13.78 - mean < 0.01` check would
    # also pass for any mean far above 13.78.
    self.assertAlmostEqual(13.78, central_tendency.mean(sample7), delta=0.01)