def SummarizeWeight(rows, input_limit=None): years = [ 1981, 1982, 1985, 1986, 1988, 1989, 1990, 1992, 1993, 1994, 1996, 1998, 2000, 2002, 2004, 2006, 2008 ] all_diffs = [] for i, row in enumerate(rows): if i == input_limit: break id, race, sex = row[:3] weights = row[3:] print id diffs = Differences(years, weights, jitter=3) all_diffs.extend(diffs) weights, changes = zip(*all_diffs) print 'Mean weight', thinkstats.Mean(weights) print 'Mean change', thinkstats.Mean(changes) print numpy.corrcoef(weights, changes) pyplot.hexbin(weights, changes, cmap=matplotlib.cm.gray_r) myplot.Save( 'nlsy_scatter', title='Weight change vs. weight', xlabel='Current weight (pounds)', ylabel='Weight change (pounds)', axis=[70, 270, -25, 25], legend=False, show=True, )
def main():
    """Call process() 1000 times and print the mean of each returned rate."""
    # every trial yields an (unchange, change) pair
    trials = [process() for _ in range(1000)]
    unchange_rate = [unchange for unchange, _ in trials]
    change_rate = [change for _, change in trials]

    mean_unchange = thinkstats.Mean(unchange_rate)
    mean_change = thinkstats.Mean(change_rate)
    print("Mean(Unchange) = %.3f \t Mean(Change) = %.3f" % (mean_unchange, mean_change))
def Process(table):
    """Runs analysis on the given table.

    Nothing is returned; the results are stored on the table itself:
        table.lengths: list of prglength values, one per record
        table.n: number of records
        table.mu: mean of the lengths
        table.std: standard deviation of the lengths

    Args:
        table: table object with a `records` sequence whose items have a
            `prglength` attribute
    """
    table.lengths = [p.prglength for p in table.records]
    table.n = len(table.lengths)
    table.mu = thinkstats.Mean(table.lengths)
    # the standard deviation is the square root of the variance; taking the
    # square root of the mean (as before) gave a meaningless number
    table.std = math.sqrt(thinkstats.Var(table.lengths, table.mu))
def Cov(xs, ys):
    """Covariance of xs and ys.

    The raw value is hard to interpret and its units carry no useful
    meaning.  Returns 0 when the two sequences differ in length.
    """
    count = len(xs)
    if count != len(ys):
        return 0

    x_mu, y_mu = thinkstats.Mean(xs), thinkstats.Mean(ys)
    # product of the paired deviations from each mean
    products = [(x - x_mu) * (y - y_mu) for x, y in zip(xs, ys)]
    return sum(products) / count
def DifferenceInMean(actual1, actual2):
    """Return the mean of each group together with their difference.

    Args:
        actual1: sequence of float
        actual2: sequence of float

    Returns:
        tuple of (mu1, mu2, mu1-mu2)
    """
    first_mean, second_mean = [
        thinkstats.Mean(group) for group in (actual1, actual2)
    ]
    return first_mean, second_mean, first_mean - second_mean
def ex1_3_4():
    """Mean prglength of first-born records versus all other records.

    Only records in TABLE.records whose outcome equals 1 are considered
    (presumably live births -- confirm against the data codebook).

    Returns a dict with keys 'avg_first' and 'avg_other'.
    """
    first_list, other_list = [], []
    for record in TABLE.records:
        if record.outcome != 1:
            continue
        # birthord 1 goes to the first-born group, anything else to the rest
        target = first_list if record.birthord == 1 else other_list
        target.append(record.prglength)

    return {
        'avg_first': thinkstats.Mean(first_list),
        'avg_other': thinkstats.Mean(other_list),
    }
def main():
    """Simulate 1-match and 82-match scenarios and print the mean result."""
    simcount = 1000  # number of simulation runs per scenario

    # two scenarios: a single match and a run of 82 matches
    for match_count in (1, 82):
        outcomes = []
        for _ in range(simcount):
            outcomes.append(matches(match_count, 10, 15, 0.5))
        mu = thinkstats.Mean(outcomes)
        print("matches %2d: prob = %.3f%% " % (match_count, mu * 100))
def ProcessScoresPairwise(pairs): """Average number of goals for each team against each opponent. pairs: map from (team1, team2) to (score1, score2) """ # map from (team1, team2) to list of goals scored goals_scored = {} for key, entries in pairs.iteritems(): t1, t2 = key for entry in entries: g1, g2 = entry goals_scored.setdefault((t1, t2), []).append(g1) goals_scored.setdefault((t2, t1), []).append(g2) # make a list of average goals scored lams = [] for key, goals in goals_scored.iteritems(): if len(goals) < 3: continue lam = thinkstats.Mean(goals) lams.append(lam) # make the distribution of average goals scored cdf = thinkbayes.MakeCdfFromList(lams) thinkplot.Cdf(cdf) thinkplot.Show() mu, var = thinkstats.MeanVar(lams) print 'mu, sig', mu, math.sqrt(var) print 'BOS v VAN', pairs['BOS', 'VAN']
def DifferenceInMeans(firsts, others, attr): """Compute the difference in means between tables for a given attr. Prints summary statistics. """ firsts_mean = thinkstats.Mean(getattr(firsts, attr)) print 'First babies, %s, trimmed mean:' % attr, firsts_mean others_mean = thinkstats.Mean(getattr(others, attr)) print 'Other babies, %s, trimmed mean:' % attr, others_mean diff = others_mean - firsts_mean print 'Difference in means:', diff print return diff
def RunFit(xs, ys): inter, slope, R2 = Fit(xs, ys) fxs = [min(xs), max(xs)] fys = [inter + slope * x for x in fxs] pyplot.plot(fxs, fys, 'r-', linewidth=2) print 'Mean diff', thinkstats.Mean(ys) print 'Current rate:', fys[-1]
def ProcessScoresTeamwise(pairs):
    """Plot and summarize the average number of goals for each team.

    pairs: map from (team1, team2) to a sequence of (score1, score2) entries

    Shows the CDF of the per-team averages and prints their mean and
    standard deviation.
    """
    # map from team to list of goals scored
    goals_scored = {}
    # .items()/.values() rather than the Python-2-only iteritems(), so this
    # agrees with the print() call below and runs on either interpreter
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored
    lams = [thinkstats.Mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
def p_value(delta, pcs, num_questions, iterations=10000):
    """Estimate how often a simulated mean difference reaches delta.

    Runs `iterations` simulations; each one takes the mean of
    fake_diffs(pcs, num_questions) and counts as a hit when that mean
    is >= delta.  Returns the fraction of hits as a float.
    """
    hits = sum(
        1
        for _ in range(iterations)
        if thinkstats.Mean(fake_diffs(pcs, num_questions)) >= delta
    )
    # float() keeps this a true division on every interpreter
    return float(hits) / iterations
def main2():
    """Simulate 1-match and 82-match scenarios and print the mean result."""
    n = 1000        # simulation runs per scenario
    criteria = 10   # passed through to match() unchanged

    for m in (1, 82):
        outcomes = []
        for _ in range(n):
            outcomes.append(match(criteria, m))
        # express the mean outcome as a percentage
        prob = thinkstats.Mean(outcomes) * 100
        print('matches %2d: prob = %.3f%%' % (m, prob))
def testVar(self):
    # Var must give the same answer whether or not the mean is supplied
    sample = [1, 1, 1, 3, 3, 591]
    mean = thinkstats.Mean(sample)
    variances = [thinkstats.Var(sample), thinkstats.Var(sample, mean)]

    self.assertAlmostEqual(mean, 100.0)
    for variance in variances:
        self.assertAlmostEqual(variance, 48217.0)
def SummarizeWeightChange(self): """Print the mean reported change in weight in kg.""" data = [(r.weight2, r.wtyrago) for r in self.records if r.weight2 != 'NA' and r.wtyrago != 'NA'] changes = [(curr - prev) for curr, prev in data] print 'Mean change', thinkstats.Mean(changes)
def MakeFigure(): fp = open('babyboom.dat') # skip to the beginning of the data for line in fp: if line.find('START DATA') != -1: break # read a list of times times = [] for line in fp: t = line.split() time = int(t[-1]) times.append(time) # compute interarrival times diffs = [times[0]] for i in range(len(times)-1): diff = times[i+1] - times[i] diffs.append(diff) n = len(diffs) mu = thinkstats.Mean(diffs) print 'mean interarrival time', mu cdf = Cdf.MakeCdfFromList(diffs, 'actual') sample = [random.expovariate(1/mu) for i in range(n)] model = Cdf.MakeCdfFromList(sample, 'model') myplot.Cdf(cdf) myplot.Save(root='interarrivals', title='Time between births', xlabel='minutes', ylabel='CDF', legend=False, formats=['eps', 'png', 'pdf']) myplot.Cdfs([cdf, model], complement=True) myplot.Save(root='interarrivals_model', title='Time between births', xlabel='minutes', ylabel='Complementary CDF', yscale='log', formats=['eps', 'png', 'pdf']) pyplot.subplots_adjust(bottom=0.11) myplot.Cdf(cdf, complement=True) myplot.Save(root='interarrivals_logy', title='Time between births', xlabel='minutes', ylabel='Complementary CDF', yscale='log', legend=False, formats=['eps', 'png', 'pdf'])
def main():
    """Compare two estimates of lam over 1000 simulated samples.

    Each round draws data with samples('expo', n=10, lam=0.5) and records
    1 / mean and ln(2) / median.  The mean of each estimate and the result
    of CalculateMSE against the true lam are printed.
    """
    lam = 0.5
    n = 10
    lams1, lams2 = [], []

    for _ in range(1000):
        data = samples('expo', n=n, lam=lam)
        s_mean, s_median = thinkstats.Mean(data), thinkstats.Median(data)
        lams1.append(1 / s_mean)
        lams2.append(math.log(2) / s_median)

    mean_of_lams1 = thinkstats.Mean(lams1)
    mean_of_lams2 = thinkstats.Mean(lams2)
    print("Mean(lams1) = %.3f, Mean(lams2) = %.3f" % (mean_of_lams1, mean_of_lams2))
    print("MSE(lams1) = %.3f, MSE(lams2) = %.3f" % (CalculateMSE(lams1, lam), CalculateMSE(lams2, lam)))
def pumpkin(weights):
    """Return (mean, variance, standard deviation) of the pumpkin weights.

    weights: iterable of pumpkin weights
    """
    mu = thinkstats.Mean(weights)
    var = thinkstats.Var(weights, mu)
    # std_dev receives the already-computed mean and variance
    return mu, var, std_dev(weights, mu, var)
def Process(table):
    """Attach summary data about prglength to the given table.

    Nothing is returned: the function works by side effect, storing the
    list of prglength values (lengths), their count (n) and their mean (mu)
    as attributes on `table` so callers can read them afterwards.

    Args:
        table: table object
    """
    lengths = [record.prglength for record in table.records]
    table.lengths = lengths
    table.n = len(lengths)
    table.mu = thinkstats.Mean(lengths)
def MakeLinePlot(age_bins):
    """Plot the mean weight of each age bin, in ascending bin order.

    age_bins: map from bin to list of weights
    """
    ordered = sorted(age_bins.iteritems())
    xs = [age_bin for age_bin, _ in ordered]
    ys = [thinkstats.Mean(weights) for _, weights in ordered]

    myplot.Plot(xs, ys, 'bs-')
    myplot.Save(root='agemodel_line',
                xlabel="Mother's age (years)",
                ylabel='Mean birthweight (oz)',
                legend=False)
def EstimateRankits(n=6, m=1000):
    """Estimate the expected values of sorted random samples.

    n: sample size
    m: number of iterations

    Returns: list of n rankits
    """
    # transpose so each column holds the values at one position across the
    # samples returned by Samples(n, m)
    columns = zip(*Samples(n, m))
    return [thinkstats.Mean(column) for column in columns]
def Cov(xs, ys, mux=None, muy=None):
    """Compute Cov(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values
        mux: optional float mean of xs; computed when omitted
        muy: optional float mean of ys; computed when omitted

    Returns:
        Cov(X, Y)
    """
    x_center = thinkstats.Mean(xs) if mux is None else mux
    y_center = thinkstats.Mean(ys) if muy is None else muy

    # accumulate the products of paired deviations in input order
    acc = 0.0
    for x_val, y_val in zip(xs, ys):
        acc += (x_val - x_center) * (y_val - y_center)

    return acc / len(xs)
def testMeanAndVar(self):
    # The mean and variance computed by thinkstats on the raw list must
    # agree with the Pmf methods, with or without a supplied mean.
    t = [1, 2, 2, 3, 5]
    mu = thinkstats.Mean(t)
    var = thinkstats.Var(t, mu)

    pmf = Pmf.MakePmfFromList(t)
    mu2 = pmf.Mean()
    var2 = pmf.Var()
    var3 = pmf.Var(mu2)

    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which no longer exists in Python 3.12
    self.assertAlmostEqual(mu, mu2)
    self.assertAlmostEqual(var, var2)
    self.assertAlmostEqual(var, var3)
def Skewness(d):
    """Return the skewness of d: the third central moment over m2**1.5.

    d: sequence of numbers

    Both moments are divided by len(d).  Raises ZeroDivisionError when every
    value in d is identical, because the second moment is then zero.
    """
    n = len(d)
    mean = thinkstats.Mean(d)

    # float accumulators keep the divisions below from truncating under
    # Python 2 when d holds integers
    m2 = 0.0
    m3 = 0.0
    for x in d:
        t = x - mean
        m2 += t**2
        m3 += t**3
    m2 /= n
    m3 /= n

    # the exponent is written as 1.5 because 3 / 2 evaluates to 1 under
    # Python 2 integer division
    return m3 / m2**1.5
def main():
    """Find the first sample size whose mean daily maximum reaches 1000.

    For n from 1 to 19, builds a year of daily maxima and stops at the first
    n whose mean is at least 1000, handing that year's data to process().
    """
    for n in range(1, 20):
        # for each of 365 days, keep the largest value of a fresh sample
        # drawn with normal_sample(n, 950, 50), truncated to an int
        daily_max = [int(max(normal_sample(n, 950, 50))) for _ in range(365)]

        mu = thinkstats.Mean(daily_max)

        # once the mean reaches 1000, stop the experiment and pass the data
        # on (the original comment says process() plots the CDF)
        if mu >= 1000:
            process(daily_max)
            break
def testVar(self): t = [1, 1, 1, 3, 3, 591] mu = thinkstats.Mean(t) var1 = thinkstats.Var(t) var2 = thinkstats.Var(t, mu) print print 'Pumpkins' print 'mean', mu print 'var', var1 print 'var', var2 self.assertAlmostEquals(mu, 100.0) self.assertAlmostEquals(var1, 48217.0) self.assertAlmostEquals(var2, 48217.0)
def main(): suite = MakeUniformSuite(0.001, 1.5, 1000) evidence = [1.5, 2, 3, 4, 5, 12] Update(suite, evidence) suite.name = 'posterior' # plot the posterior distributions myplot.Pmf(suite) myplot.Show(title='Decay parameter', xlabel='Parameter (inverse cm)', ylabel='Posterior probability') print 'Naive parameter estimate:', 1.0 / thinkstats.Mean(evidence) print 'Mean of the posterior distribution:', suite.Mean()
def PlotCdfs(d, labels):
    """Plot one mean-centered, jittered CDF per sequence in a dictionary.

    The mean is taken from the values before jittering and then subtracted
    from the jittered values.

    d: map from key to sequence of values
    labels: map from key to string label
    """
    thinkplot.Clf()

    for key, values in d.iteritems():
        center = thinkstats.Mean(values)
        jittered = thinkstats.Jitter(values, 1.3)
        centered = [value - center for value in jittered]
        thinkplot.Cdf(thinkbayes.MakeCdfFromList(centered), label=labels[key])

    thinkplot.Show()
def Partition(ages, weights, bin_size=5):
    """Group weights into age bins and print the mean weight of each bin.

    ages: sequence of ages
    weights: sequence of weights, paired with ages by position
    bin_size: width of each age bin

    Returns: map from bin index, floor(age / bin_size), to list of weights
    """
    weight_dict = {}
    for age, weight in zip(ages, weights):
        # the key is the bin index, not an age
        bin = math.floor(age / bin_size)
        weight_dict.setdefault(bin, []).append(weight)

    print 'Bin', 'Mean weight (oz)'
    for bin, bin_weights in weight_dict.iteritems():
        # scale the bin index back to the age at the lower edge of the bin
        age = bin * bin_size
        try:
            mean = thinkstats.Mean(bin_weights)
            print age, mean
        except ZeroDivisionError:
            # NOTE(review): every bin is created together with its first
            # weight, so this branch looks unreachable -- confirm
            continue
    print
    return weight_dict
def Partition(ages, weights, bin_size=2):
    """Break ages into bins.

    Each age is assigned to the bin of width bin_size that contains it, and
    the bin is identified by its midpoint:
    bin_size * floor(age / bin_size) + bin_size / 2.0.

    ages: sequence of ages
    weights: sequence of weights, paired with ages by position
    bin_size: width of each bin

    Returns a map from bin midpoint to list of weights.
    """
    half_width = bin_size / 2.0

    weight_dict = {}
    for age, weight in zip(ages, weights):
        midpoint = bin_size * math.floor(age / bin_size) + half_width
        weight_dict.setdefault(midpoint, []).append(weight)

    # A former second pass computed the mean of every bin and threw it away;
    # it had no effect on the result, so it has been removed.
    return weight_dict