def Corr(xs, ys):
    """Computes the Pearson correlation coefficient (covariance -> standard scores).

    The Pearson correlation coefficient is sensitive to outliers.
    """
    cov = correlation.Cov(xs, ys)
    _, xs_var = thinkstats.MeanVar(xs)
    _, ys_var = thinkstats.MeanVar(ys)
    return cov / math.sqrt(xs_var * ys_var)
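# Minimal usage sketch (not from the original source): assumes the Corr above is
# defined in a module where the Think Stats thinkstats and correlation modules
# are importable. For perfectly linear data the Pearson correlation should be 1.
xs = [1, 2, 3, 4, 5]
ys = [2, 4, 6, 8, 10]
print(Corr(xs, ys))   # expected: 1.0 (up to floating-point error)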
def CoefDetermination(ys, res):
    """Computes the coefficient of determination (R^2) for given residuals.

    Args:
        ys: dependent variable
        res: residuals

    Returns:
        float coefficient of determination
    """
    ybar, vary = thinkstats.MeanVar(ys)
    resbar, varres = thinkstats.MeanVar(res)
    return 1 - varres / vary
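# Hypothetical illustration (not from the original source): with ys having
# population variance 5 and residuals having variance 1, R^2 should be 0.8.
ys = [3, 5, 7, 9]      # Var(ys) = 5
res = [1, -1, 1, -1]   # Var(res) = 1
print(CoefDetermination(ys, res))   # expected: 0.8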
def ProcessScoresPairwise(pairs):
    """Average number of goals for each team against each opponent.

    pairs: map from (team1, team2) to a list of (score1, score2) pairs
    """
    # map from (team1, team2) to list of goals scored
    goals_scored = {}
    for key, entries in pairs.iteritems():
        t1, t2 = key
        for entry in entries:
            g1, g2 = entry
            goals_scored.setdefault((t1, t2), []).append(g1)
            goals_scored.setdefault((t2, t1), []).append(g2)

    # make a list of average goals scored
    lams = []
    for key, goals in goals_scored.iteritems():
        if len(goals) < 3:
            continue
        lam = thinkstats.Mean(goals)
        lams.append(lam)

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print 'mu, sig', mu, math.sqrt(var)

    print 'BOS v VAN', pairs['BOS', 'VAN']
def ProcessScoresTeamwise(pairs):
    """Average number of goals for each team.

    pairs: map from (team1, team2) to a list of (score1, score2) pairs
    """
    # map from team to list of goals scored
    goals_scored = {}
    for key, entries in pairs.iteritems():
        t1, t2 = key
        for entry in entries:
            g1, g2 = entry
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored
    lams = []
    for key, goals in goals_scored.iteritems():
        lam = thinkstats.Mean(goals)
        lams.append(lam)

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
def process(data):
    # histogram of the data
    hist = Pmf.MakeHistFromList(data, name='hist')
    myplot.Hist(hist, color='blue')
    myplot.Show()

    # PMF of the data
    pmf = Pmf.MakePmfFromHist(hist, name='pmf')
    myplot.Pmf(pmf, color='yellow')
    myplot.Show()
    myplot.Clf()

    # CDF of the observed data
    cdf = Cdf.MakeCdfFromList(data, name='loafs')
    myplot.Cdf(cdf)

    mu, var = thinkstats.MeanVar(data)
    sigma = math.sqrt(var)
    print("mu = %.3f, sigma = %.3f" % (mu, sigma))

    # normal distribution with the same mean and standard deviation
    xs = normal_sample(len(data), mu, sigma)
    # xs = data
    ys = [erf.NormalCdf(x, mu=mu, sigma=sigma) for x in xs]
    myplot.Scatter(xs, ys, color='red', label='sample')
    myplot.Show()
def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    xbar, varx = thinkstats.MeanVar(xs)
    ybar, vary = thinkstats.MeanVar(ys)
    corr = Cov(xs, ys, xbar, ybar) / math.sqrt(varx * vary)
    return corr
def PValue(model1, model2, n, m, delta, iters=1000):
    """Computes the distribution of deltas under the model distributions,
    and the p-value of the observed delta.

    Args:
        model1, model2: sequences of values from the hypothetical distributions
        n: sample size from model1
        m: sample size from model2
        delta: the observed difference in the means
        iters: how many samples to generate
    """
    deltas = [Resample(model1, model2, n, m) for i in range(iters)]
    mean_var = thinkstats.MeanVar(deltas)
    print('(Mean, Var) of resampled deltas', mean_var)

    cdf = Cdf.MakeCdfFromList(deltas)

    # compute the two tail probabilities
    left = cdf.Prob(-delta)
    right = 1.0 - cdf.Prob(delta)
    pvalue = left + right
    print('Tails (left, right, total):', left, right, left + right)

    return cdf, pvalue
def MakeUniformPrior(t, num_points, label, spread=3.0):
    """Makes a prior distribution for mu and sigma based on a sample.

    t: sample
    num_points: number of values in each dimension
    label: string label for the new Pmf
    spread: number of standard errors to include

    Returns:
        tuple of (ms, ss, pmf), where pmf maps from (mu, sigma) to prob.
    """
    # estimate mean and stddev of t
    n = len(t)
    xbar, S2 = thinkstats.MeanVar(t)
    sighat = math.sqrt(S2)

    print xbar, sighat, sighat / xbar

    # compute standard error for mu and the range of ms
    stderr_xbar = sighat / math.sqrt(n)
    mspread = spread * stderr_xbar
    ms = numpy.linspace(xbar - mspread, xbar + mspread, num_points)

    # compute standard error for sigma and the range of ss
    stderr_sighat = sighat / math.sqrt(2 * (n - 1))
    sspread = spread * stderr_sighat
    ss = numpy.linspace(sighat - sspread, sighat + sspread, num_points)

    # populate the PMF
    pmf = Pmf.Pmf(name=label)
    for m in ms:
        for s in ss:
            pmf.Set((m, s), 1)

    return ms, ss, pmf
def MakeErrorModel(model, ys, ts, n=100):
    """Makes a model that captures sample error and residual error.

    model: string representation of the regression model
    ys: dependent variable
    ts: explanatory variable
    n: number of simulations to run

    Returns a tuple (fts, sample_error, total_error), where each error
    model is a pair of rows.
    """
    # estimate mean and stddev of the residuals
    residuals = Residuals(model, ys, ts)
    mu, var = thinkstats.MeanVar(residuals)
    sig = math.sqrt(var)

    # make the best fit
    fts, fys = MakeFit(model, ys, ts)

    # resample residuals and generate hypothetical fits
    fits = []
    for i in range(n):
        fake_ys = [fy + random.gauss(mu, sig) for fy in fys[:-1]]
        _, fake_fys = MakeFit(model, fake_ys, ts)
        fits.append(fake_fys)

    # find the 90% CI in each column
    columns = zip(*fits)
    sample_error = MakeStddev(columns)
    total_error = MakeStddev(columns, mu, var)

    return fts, sample_error, total_error
def LeastSquares(xs, ys):
    """Computes a linear least squares fit for ys as a function of xs.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    xbar, varx = thinkstats.MeanVar(xs)
    ybar, vary = thinkstats.MeanVar(ys)
    slope = Cov(xs, ys, xbar, ybar) / varx
    inter = ybar - slope * xbar
    return inter, slope
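# Usage sketch with made-up data (assumes Cov and thinkstats as in the functions
# above): fitting points on the line y = 2x + 1 should recover slope 2, intercept 1.
xs = [1, 2, 3, 4]
ys = [3, 5, 7, 9]
inter, slope = LeastSquares(xs, ys)
print(inter, slope)   # expected: 1.0 2.0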
def Process(table):
    '''Computes analysis results for the table.

    Args:
        table: table object
    '''
    # read the pregnancy-length field (prglength) from every record in the table
    table.lengths = [p.prglength for p in table.records]
    table.n = len(table.records)
    table.mu = Mean(table.lengths)
    mv = thinkstats.MeanVar(table.lengths)
    table.sd = math.sqrt(mv[1])
def accident_cdf(col_dict, attr='accidents'):
    """Makes a CDF of the number of accidents per day.

    col_dict: map from attribute name to column of data
    attr: string attribute name, one of accidents, injury, fatal
    """
    accidents = col_dict[attr]
    cdf = Cdf.MakeCdfFromList(accidents)
    print thinkstats.MeanVar(accidents)
    return cdf
def MakeStddev(columns, mu2=0, var2=0):
    """Finds a confidence interval for each column.

    Returns two rows: the low ends of the intervals and the high ends.
    """
    stats = [thinkstats.MeanVar(ys) for ys in columns]
    min_fys = [mu1 + mu2 - 2 * math.sqrt(var1 + var2) for mu1, var1 in stats]
    max_fys = [mu1 + mu2 + 2 * math.sqrt(var1 + var2) for mu1, var1 in stats]
    return min_fys, max_fys
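# Hypothetical check (not from the original source): with the defaults, each
# column's interval is its mean plus or minus two standard deviations, so a
# constant column collapses to its mean.
columns = [[1, 2, 3], [10, 10, 10]]
low, high = MakeStddev(columns)
print(low)    # second entry expected: 10.0
print(high)   # second entry expected: 10.0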
def Test(actual1, actual2, model, iters=1000):
    """Estimates p-values based on differences in the mean.

    Args:
        actual1, actual2: sequences of observed values for groups 1 and 2
        model: sequence of values from the hypothetical distribution
    """
    n = len(actual1)
    m = len(actual2)

    # compute delta
    mu1, mu2, delta = hypothesis.DifferenceInMean(actual1, actual2)
    delta = abs(delta)

    print('n:', n)
    print('m:', m)
    print('mu1', mu1)
    print('mu2', mu2)
    print('delta', delta)

    # compute the expected distribution of differences in sample mean
    mu_pooled, var_pooled = thinkstats.MeanVar(model)
    print('(Mean, Var) of pooled data', mu_pooled, var_pooled)

    f = 1.0 / n + 1.0 / m
    mu, var = (0, f * var_pooled)
    print('Expected Mean, Var of deltas', mu, var)

    # compute the p-value of delta in the observed distribution
    sigma = math.sqrt(var)
    left = erf.NormalCdf(-delta, mu, sigma)
    right = 1 - erf.NormalCdf(delta, mu, sigma)
    pvalue = left + right
    print('Tails:', left, right)
    print('p-value:', pvalue)

    # compare the mean and variance of resampled differences
    deltas = [hypothesis.Resample(model, model, n, m) for i in range(iters)]
    mean_var = thinkstats.MeanVar(deltas)
    print('(Mean, Var) of resampled deltas', mean_var)

    return pvalue
def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y)
    """
    xbar, varx = thinkstats.MeanVar(xs)
    ybar, vary = thinkstats.MeanVar(ys)
    try:
        corr = Cov(xs, ys, xbar, ybar) / math.sqrt(varx * vary)
    except ZeroDivisionError:
        # xs or ys has zero variance, so the correlation is undefined
        return 0
    return corr
def Q1(people, prob, year):
    # number of simulation runs
    simcount = 100
    results = []
    for c in range(simcount):
        results.append(process(people, prob, year))
    mu, var = thinkstats.MeanVar(results)
    sigma = math.sqrt(var)
    print("Q1: mu = %.3f, var = %.3f, sigma = %.3f" % (mu, var, sigma))
    return results
def Pumpkin():
    '''Exercise 2-1

    Computes the mean, variance, and standard deviation of pumpkin weights.
    '''
    tp = (1, 1, 1, 3, 3, 591)
    print('Mean, variance, and standard deviation of pumpkin weights')
    mv = thinkstats.MeanVar(tp)
    print('mean = ', mv[0])
    print('variance = ', mv[1])
    σ = math.sqrt(mv[1])
    print('standard deviation = ', σ)
def PearsonSkewness(d):
    n = len(d)
    d = sorted(d)                 # sorted() returns a new list; keep the sorted copy
    mean, var = thinkstats.MeanVar(d)
    sigma = math.sqrt(var)
    if n % 2:
        median = d[n // 2]
    else:
        median = (d[n // 2 - 1] + d[n // 2]) / 2
    print("median = %.3f" % median)
    return 3 * (mean - median) / sigma
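# Usage sketch with hypothetical data (not from the original source): for a
# right-skewed sample the mean exceeds the median, so the result is positive.
data = [1, 2, 2, 3, 3, 3, 4, 10]
print(PearsonSkewness(data))   # expected: a positive value, roughly 0.58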
def main():
    resp = brfss.Respondents()
    resp.ReadRecords(data_dir='res')
    heights, weights = resp.GetHeightAndWeight()

    c1 = Cov(heights, weights)
    c2 = Cov(heights, heights)
    _, var = thinkstats.MeanVar(heights)
    print(c1, c2, var)
    print("-------------- ")

    # method 2: the correlation module's implementation
    c3 = correlation.Cov(heights, weights)
    c4 = correlation.Cov(heights, heights)
    print(c3, c4, var)
def Summarize(data_dir):
    """Prints summary statistics for first babies and others.

    Returns: tuple of Tables
    """
    table, firsts, others = MakeTables(data_dir)
    ProcessTables(firsts, others)

    # print the standard deviation and mean
    import thinkstats as ts

    birthlist = []
    for p in firsts.records:
        birthlist.append(p.prglength)
    mean, variance = ts.MeanVar(birthlist)
    print "standard:first", variance ** 0.5, mean

    birthlist = []
    for p in others.records:
        birthlist.append(p.prglength)
    mean, variance = ts.MeanVar(birthlist)
    print "standard:others", variance ** 0.5, mean
    # end of summary output

    print 'Number of first babies', firsts.n
    print 'Number of others', others.n

    mu1, mu2 = firsts.mu, others.mu
    print 'Mean gestation in weeks:'
    print 'First babies', mu1
    print 'Others', mu2
    print 'Difference in days', (mu1 - mu2) * 7.0
def main():
    random.seed(time.clock())
    pool, firsts, others = cumulative.MakeTables()
    mean_var = thinkstats.MeanVar(pool.lengths)
    print('(Mean, Var) of pooled data', mean_var)

    print("--------------4/4 ")
    # full NSFG data
    hypothesis.RunTest('length-4-4', pool.lengths, firsts.lengths, others.lengths,
                       iters=1000, trim=False, partition=False)

    print("--------------3/4 ")
    # 3/4 of the NSFG data
    hypothesis.RunTest('length-3-4',
                       hypothesis.SampleWithoutReplacement(pool.lengths, int(pool.n * 0.75)),
                       hypothesis.SampleWithoutReplacement(firsts.lengths, int(firsts.n * 0.75)),
                       hypothesis.SampleWithoutReplacement(others.lengths, int(others.n * 0.75)),
                       iters=1000, trim=False, partition=False)

    print("--------------2/4 ")
    # 1/2 of the NSFG data
    hypothesis.RunTest('length-half-2-4',
                       hypothesis.SampleWithoutReplacement(pool.lengths, int(pool.n * 0.5)),
                       hypothesis.SampleWithoutReplacement(firsts.lengths, int(firsts.n * 0.5)),
                       hypothesis.SampleWithoutReplacement(others.lengths, int(others.n * 0.5)),
                       iters=1000, trim=False, partition=False)

    print("--------------1/4 ")
    # 1/4 of the NSFG data
    hypothesis.RunTest('length-half-1-4',
                       hypothesis.SampleWithoutReplacement(pool.lengths, int(pool.n * 0.25)),
                       hypothesis.SampleWithoutReplacement(firsts.lengths, int(firsts.n * 0.25)),
                       hypothesis.SampleWithoutReplacement(others.lengths, int(others.n * 0.25)),
                       iters=1000, trim=False, partition=False)
def main():
    random.seed(1)

    # get the data
    pool, firsts, others = cumulative.MakeTables()
    mean_var = thinkstats.MeanVar(pool.lengths)
    print('(Mean, Var) of pooled data', mean_var)

    # run the test
    RunTest('length', pool.lengths, firsts.lengths, others.lengths,
            iters=1000, trim=False, partition=False)
def main():
    firsts, others, babies = Babies.PartitionBabies()

    # p is presumably a module-level flag: 0 selects birth weights,
    # anything else selects pregnancy lengths
    if p == 0:
        firsts_wtlist = Babies.GetWightList(firsts)
        others_wtlist = Babies.GetWightList(others)
        babies_wtlist = Babies.GetWightList(babies)
    else:
        firsts_wtlist = Babies.GetPregnacyList(firsts)
        others_wtlist = Babies.GetPregnacyList(others)
        babies_wtlist = Babies.GetPregnacyList(babies)

    print('(Mean, Var) of babies data', thinkstats.MeanVar(babies_wtlist))

    # P(E|H0)
    peh0 = hypothesis.Test("peh0", firsts_wtlist, others_wtlist,
                           babies_wtlist, babies_wtlist, iters=1000, plot=False)
    # P(E|Ha)
    peha = hypothesis.Test("peh0", firsts_wtlist, others_wtlist,
                           firsts_wtlist, others_wtlist, iters=1000, plot=False)
    # P(Ha)
    pha = 0.5
    # P(E)
    pe = peha * pha + peh0 * (1 - pha)
    # P(Ha|E)
    phae = (peha * pha) / pe
    print("pha = %.3f, peh0 = %.3f, peha = %.3f, phae = %.3f" % (pha, peh0, peha, phae))
def Pumpkin(data):
    mu, var = thinkstats.MeanVar(data)
    sigma = math.sqrt(var)
    print("Pumpkin mean:{} variance:{} standard deviation:{}".format(mu, var, sigma))
def standVar(first, others):
    import thinkstats as stats
    import math
    fmu, fvar = stats.MeanVar(first)
    omu, ovar = stats.MeanVar(others)
    return math.sqrt(fvar), math.sqrt(ovar), fmu, omu
def Summarize(srcs):
    """Computes the number of edges for each source."""
    lens = [len(t) for t in srcs.values()]
    mu, sigma2 = thinkstats.MeanVar(lens)
    print(mu, math.sqrt(sigma2))
    return lens
def Process(table):
    table.lengths = [p.prglength for p in table.records]
    table.n = len(table.lengths)
    table.mu, table.var = thinkstats.MeanVar(table.lengths)
    table.svar = math.sqrt(table.var)
def Pumpkin():
    weights = [p.weight for p in pumpkins]
    mu, var = thinkstats.MeanVar(weights)
    s = math.sqrt(var)
    return (mu, var, s)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import Pmf
import thinkstats


def PmfMean(pmf):
    mu = 0.0
    for key, val in pmf.Items():
        mu += key * val
    return mu


def PmfVar(pmf, mu):
    var = 0.0
    for key, val in pmf.Items():
        var += val * (key - mu) ** 2
    return var


if __name__ == "__main__":
    data = (1, 1, 1, 3, 3, 591)
    mu, var = thinkstats.MeanVar(data)
    print("Mean:{} Var:{}".format(mu, var))
    print("-------------- ")

    pmf = Pmf.MakePmfFromList(data, name='test')
    mu = PmfMean(pmf)
    var = PmfVar(pmf, mu)
    print("Mean:{} Var:{}".format(mu, var))
    print("-------------- ")

    print("Mean:{} Var:{}".format(pmf.Mean(), pmf.Var(mu=None)))
print 'Number of pregnancies', len(pregnancies.records)
print

preg_lengths_first = []
preg_lengths_others = []

for preg in pregnancies.records:
    if preg.outcome != 1:
        continue
    if preg.birthord == 1:
        preg_lengths_first.append(preg.prglength)
    else:
        preg_lengths_others.append(preg.prglength)

pregs_first = len(preg_lengths_first)
mean_length_first, var_length_first = thinkstats.MeanVar(preg_lengths_first)
std_length_first = math.sqrt(var_length_first)

pregs_others = len(preg_lengths_others)
mean_length_others, var_length_others = thinkstats.MeanVar(preg_lengths_others)
std_length_others = math.sqrt(var_length_others)

print 'Number of live births, first child', pregs_first
print 'Mean pregnancy length (weeks), first child', mean_length_first
print 'Variance of gestation time, first child', var_length_first
print 'Standard deviation of gestation time, first child', std_length_first
print
print 'Number of live births, other children', pregs_others
print 'Mean pregnancy length (weeks), other children', mean_length_others
print 'Variance of gestation time, other children', var_length_others
print 'Standard deviation of gestation time, other children', std_length_others