def main():
    """Compare Germany and Argentina using the 2014 World Cup final.

    Builds a ``Soccer`` suite over 201 evenly spaced hypotheses in [0, 12],
    copies it for the second team, updates each copy with that team's
    result, prints the posterior means, and shows the posterior and
    predictive distributions.
    """
    rates = numpy.linspace(0, 12, 201)

    # Start with a prior based on a pseudo observation,
    # chosen to yield the right prior mean.
    germany = Soccer(rates, label='Germany')
    germany.Update(0.34)
    argentina = germany.Copy(label='Argentina')

    # Update with the results of the World Cup 2014 final.
    germany.Update(1)
    argentina.Update(0)

    print('posterior mean Germany', germany.Mean())
    print('posterior mean Argentina', argentina.Mean())

    # Plot the posteriors.
    posteriors = [germany, argentina]
    thinkplot.PrePlot(2)
    thinkplot.Pdfs(posteriors)
    thinkplot.Show()

    # TODO: compute posterior prob Germany is better than Argentina

    # TODO: compute the Bayes factor of the evidence

    # Predictive distributions for goals scored in a rematch.
    predictions = [
        germany.PredictiveDist(label='Germany'),
        argentina.PredictiveDist(label='Argentina'),
    ]

    # Plot the predictive distributions.
    thinkplot.PrePlot(2)
    thinkplot.Pdfs(predictions)
    thinkplot.Show()
def main():
    """Run one round of mutual updates between a ``User`` and a ``Link``.

    Each suite is plotted and summarised (mean and 90% credible interval)
    before and after its update. Both updates use the *pre-update* mean of
    the other suite, divided by 100.

    Returns:
        0, always.
    """
    def report(suite):
        # Show the distribution, then print its mean and 90% credible interval.
        thinkplot.Pdf(suite)
        thinkplot.Show()
        print(suite.Mean(), suite.CredibleInterval(90))

    user = User(label='user')
    # Seed the user from a Beta(2, 1) Pmf, multiplying every value by 100.
    prior = thinkbayes2.Beta(2, 1)
    for value, prob in prior.MakePmf().Items():
        user.Set(value * 100, prob)
    report(user)
    mean_r = user.Mean() / 100.0

    link = Link(range(0, 101), label='link')
    report(link)
    mean_q = link.Mean() / 100.0

    user.Update(('up', mean_q))
    report(user)

    # mean_r was captured before the user update above.
    link.Update(('up', mean_r))
    report(link)

    return 0
def CH7_3(show=1):
    """Compute posterior distributions for the Bruins and the Canucks.

    Values used for the updates (presumably goals per game, one row per
    game):

        bruins  canucks
          0        1
          2        3
          8        1
          4        0

    Args:
        show: when truthy, plot both suites before ('PRE') and after
            ('POST') the updates.

    Returns:
        Tuple ``(bruins_suite, canucks_suite)`` after the updates.
    """
    bruins = Hockey('bruins')
    canucks = Hockey('canucks')

    def plot_both(title):
        # Draw both suites on a cleared figure and display it.
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)
        for team in (bruins, canucks):
            thinkplot.Pmf(team)
        thinkplot.Show(title=title,
                       xlabel='Goals per game',
                       ylabel='Probability')

    if show:
        plot_both('PRE')

    bruins.UpdateSet([0, 2, 8, 4])
    canucks.UpdateSet([1, 3, 1, 0])

    if show:
        # Look at the most likely values of lam: the posterior
        # distribution of goals per game.
        plot_both('POST')

    return bruins, canucks
def CycleExtract(fw, data, pnum, trial, plane, marker, plot, plot2):
    """Split one trial into cycles delimited by consecutive heel strikes.

    Args:
        fw: footwear condition; one of 'AFO', 'PPAFO' or 'Shoes'.
        data: sequence whose items 0, 1 and 2 are the 'AFO', 'PPAFO' and
            'Shoes' data sets respectively.
        pnum: participant key used to index the chosen data set.
        trial: trial identifier passed to ``GetTrial``.
        plane: plane identifier passed to ``GetData``.
        marker: forwarded to ``HeelStrike``.
        plot: forwarded to ``HeelStrike``.
        plot2: when equal to ``True``, plot each extracted cycle on top of
            the full 'R_HEEL' and 'L_HEEL' traces.

    Returns:
        List of ``(index, cycle)`` tuples, one per pair of consecutive heel
        strikes: ``index`` is the range of row positions and ``cycle`` the
        matching slice of the trial data.

    Raises:
        ValueError: if ``fw`` is not a recognised footwear condition.
    """
    dataset_position = {'AFO': 0, 'PPAFO': 1, 'Shoes': 2}
    if fw not in dataset_position:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``choicedata``.
        raise ValueError(
            "unknown footwear condition %r; expected 'AFO', 'PPAFO' or 'Shoes'"
            % (fw,))
    choicedata = data[dataset_position[fw]]

    strike_charac, strike_loc = HeelStrike(fw, data, pnum, trial, marker, plot)
    num_cycles = len(strike_charac)
    dataframe = choicedata[pnum].GetTrial(trial).GetData(plane)

    cycle_set = []
    for i in range(num_cycles - 1):
        # Hard-coded offsets of 40 and 50 rows past each strike;
        # NOTE(review): rationale is not visible here - confirm.
        start_rowindex = strike_charac[i][0] + 40
        end_rowindex = strike_charac[i + 1][0] + 50
        cycle = dataframe[start_rowindex:end_rowindex]
        index = range(start_rowindex, end_rowindex, 1)
        cycle_set.append((index, cycle))

    # Kept as an equality test so non-boolean truthy flags behave as before.
    if plot2 == True:  # noqa: E712
        for index, cycle in cycle_set:
            thinkplot.Plot(dataframe['R_HEEL'], color='blue',
                           label='Right full set')
            thinkplot.Plot(index, cycle['R_HEEL'], color='red',
                           label='Right cycle set')
            thinkplot.Show(legend=True)
            thinkplot.Plot(dataframe['L_HEEL'], color='blue',
                           label='Left full set')
            thinkplot.Plot(index, cycle['L_HEEL'], color='red',
                           label='Left cycle set')
            thinkplot.Show(legend=True)

    return cycle_set
def sin_spectrum():
    """Plot the spectrum of a sine note, then rebuild and write the wave.

    Makes a note with ``thinkdsp.make_note(69, 0.5, SinSignal)``, shows its
    spectrum, prints the first entry returned by ``spectrum.peaks()``,
    converts the spectrum back into a wave, shows it, and writes it out
    with ``Wave.write()`` default arguments.
    """
    # NOTE(review): 69 and 0.5 are presumably the MIDI pitch and the
    # duration in seconds - confirm against thinkdsp.make_note.
    wave = thinkdsp.make_note(69, 0.5, SinSignal)
    spectrum = wave.spectrum()
    spectrum.plot()
    thinkplot.Show()

    peaks = spectrum.peaks()
    # print() call instead of the Python 2 print statement.
    print(peaks[0])

    wave2 = spectrum.make_wave()
    wave2.plot()
    thinkplot.Show()
    wave2.write()
def main():
    """Plot the data in ``mystery0.dat`` six ways on a 2x3 grid.

    Panels: plain CDF, CDF with log x-scale, 'exponential' transform,
    normal probability plot, 'pareto' transform and 'weibull' transform.
    """
    data = read_file('mystery0.dat')
    cdf = thinkstats2.MakeCdfFromList(data)

    def panel(position, title, **cdf_options):
        # Draw the CDF in one grid cell; whatever thinkplot.Cdf returns is
        # forwarded to Config as keyword options.
        thinkplot.SubPlot(2, 3, position)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    # Panel 1 does not forward the Cdf return value to Config.
    thinkplot.SubPlot(2, 3, 1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    panel(2, 'logx', xscale='log')
    panel(3, 'expo', transform='exponential')

    thinkplot.SubPlot(2, 3, 4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    panel(5, 'pareto', transform='pareto')
    panel(6, 'weibull', transform='weibull')

    thinkplot.Show()
def main(script, filename='mystery0.dat'):
    """Plot the data in ``filename`` six ways on a 2x3 grid.

    Args:
        script: unused by this function.
        filename: file passed to ``ReadFile``; also used as the label of
            the first panel.
    """
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)
    thinkplot.PrePlot(num=6, rows=2, cols=3)

    def panel(position, title, ylabel, **cdf_options):
        # Draw the CDF in one cell; whatever thinkplot.Cdf returns is
        # forwarded to Config as keyword options.
        thinkplot.SubPlot(position)
        scale = thinkplot.Cdf(cdf, color='C0', **cdf_options)
        thinkplot.Config(title=title, ylabel=ylabel, **scale)

    # Panel 1 is labelled and does not forward the Cdf return value.
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf, color='C0', label=filename)
    thinkplot.Config(title='CDF on linear scale', ylabel='CDF')

    panel(2, 'CDF on log-x scale', 'CDF', xscale='log')
    panel(3, 'CCDF on log-y scale', 'log CCDF', transform='exponential')

    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys, color='C0')
    thinkplot.Config(title='Normal probability plot',
                     xlabel='random normal',
                     ylabel='data')

    panel(5, 'CCDF on log-log scale', 'log CCDF', transform='pareto')
    panel(6, 'CCDF on loglog-y log-x scale', 'log log CCDF',
          transform='weibull')

    thinkplot.Show(legend=False)
def ClassSizes():
    """Print mean/variance of the actual, biased and unbiased class-size Pmfs.

    Builds a Pmf from a fixed table of class sizes, applies ``BiasPmf`` and
    then ``UnbiasPmf``, prints the mean and variance at each stage, and
    plots the actual and biased Pmfs.
    """
    # start with the actual distribution of class sizes from the book
    d = {
        7: 8,
        12: 8,
        17: 14,
        22: 4,
        27: 6,
        32: 12,
        37: 8,
        42: 3,
        47: 2,
    }

    # form the pmf
    pmf = thinkstats2.MakePmfFromDict(d, 'actual')
    # print() calls replace Python 2 print statements; the output text is
    # the same ("mean <value>").
    print('mean', pmf.Mean())
    print('var', pmf.Var())

    # compute the biased pmf
    biased_pmf = BiasPmf(pmf, 'observed')
    print('mean', biased_pmf.Mean())
    print('var', biased_pmf.Var())

    # unbias the biased pmf
    unbiased_pmf = UnbiasPmf(biased_pmf, 'unbiased')
    print('mean', unbiased_pmf.Mean())
    print('var', unbiased_pmf.Var())

    # plot the Pmfs (the unbiased one is only reported, not plotted)
    thinkplot.Pmfs([pmf, biased_pmf])
    thinkplot.Show(xlabel='Class size', ylabel='PMF')
def main():
    """Plot Detector posteriors for three hypothetical values of r.

    For r in 100, 250 and 400 a ``Detector(r, f, step=1)`` suite is updated
    with k = 15, plotted, and its ``MaximumLikelihood()`` printed.

    NOTE(review): the source was collapsed onto one line, so the loop
    extent shown here (``Show`` after the loop) is an assumption - confirm.
    """
    k = 15
    f = 0.1

    # plot Detector suites for a range of hypothetical r
    thinkplot.PrePlot(num=3)
    for r in [100, 250, 400]:
        suite = Detector(r, f, step=1)
        suite.Update(k)
        thinkplot.Pmf(suite)
        print(suite.MaximumLikelihood())

    thinkplot.Show(xlabel='Number of particles (n)', ylabel='PMF')
    # NOTE(review): this bare return makes everything below unreachable;
    # confirm whether the Emitter2 section is meant to be disabled.
    return

    # plot the posterior distributions of r and n
    hypos = range(1, 501, 5)
    suite = Emitter2(hypos, f=f)
    suite.Update(k)

    thinkplot.PrePlot(num=2)
    post_r = suite.DistOfR(name='posterior r')
    post_n = suite.DistOfN(name='posterior n')

    thinkplot.Pmf(post_r)
    thinkplot.Pmf(post_n)

    thinkplot.Save(root='jaynes2', xlabel='Emission rate', ylabel='PMF', formats=FORMATS)
def scatter(x):
    """Scatter-plot total crimes against column ``x`` and print statistics.

    Reads the module-level ``df``. The plot uses time-based labels when
    ``x`` is 'month' and crime-category labels otherwise. After the plot,
    Spearman's correlation and the covariance between ``df.Total_crimes``
    and ``df[x]`` are printed, followed by a blank line.

    Args:
        x: name of the ``df`` column to plot on the x-axis.
    """
    totals = df.Total_crimes
    thinkplot.Scatter(df[x], totals, alpha=.5)

    if x == 'month':
        labels = dict(title="Total Crimes vs Time", xlabel="Year")
    else:
        labels = dict(title="Total Crimes vs " + x + " Crimes",
                      xlabel=x + " Crimes")
    thinkplot.Show(ylabel="Total Crimes", **labels)

    print(x + " crime stats")
    print("Spearman's correlation:", thinkstats2.SpearmanCorr(totals, df[x]))
    print("Covariance:", thinkstats2.Cov(totals, df[x]))
    print()
def main():
    """Update a ``Volunteer`` suite over a (q, r) grid and plot marginals.

    The hypotheses are all pairs from a 101-point grid on [0, 1]. The suite
    is updated twice, then the marginal distributions of element 0 (q) and
    element 1 (r) are plotted.
    """
    grid = numpy.linspace(0, 1, 101)
    # Same ordering as a nested loop: q varies slowest, r fastest.
    hypos = [(q, r) for q in grid for r in grid]
    suite = Volunteer(hypos)

    # Update with the larger sample of students who signed up and reported.
    suite.Update((140, 50))

    # Update again with the smaller sample of students who signed up,
    # participated, and reported.
    suite.Update((5, 3, 1))

    #p_marginal = MarginalProduct(suite)
    marginals = [
        (MarginalDistribution(suite, 0), 'q'),
        (MarginalDistribution(suite, 1), 'r'),
    ]
    for marginal, label in marginals:
        thinkplot.Pmf(marginal, label=label)
    #thinkplot.Pmf(p_marginal)

    thinkplot.Show()
def main():
    """Print the mean of a ``Version3`` suite and show its Pdf, no legend."""
    dist = Version3()
    mean = dist.Mean()
    print(mean)

    thinkplot.Pdf(dist)
    thinkplot.Show(legend=False)
def main():
    """Print the mean of a ``Version3`` suite and show its Pmf."""
    suite = Version3()
    # print() call instead of the Python 2 print statement.
    print(suite.Mean())

    thinkplot.Pmf(suite)
    thinkplot.Show()
def main():
    """Report skewness statistics of the interpolated log-income sample.

    Reads the data with ``hinc.ReadData()``, builds a sample with
    ``InterpolateSample(df, log_upper=6.0)``, prints the median, Pearson's
    median skewness, skewness, mean and the CDF value at the mean, plus
    fixed explanatory remarks, then plots the CDF.
    """
    income_df = hinc.ReadData()
    sample = InterpolateSample(income_df, log_upper=6.0)
    cdf = thinkstats2.Cdf(sample)

    median = thinkstats2.Median(sample)
    print("median", median)

    pearson = thinkstats2.PearsonMedianSkewness(sample)
    print("pearson's median skewness", pearson)

    skewness = thinkstats2.Skewness(sample)
    print("skewness", skewness)

    print("mean", cdf.Mean())

    # The remarks below are fixed text, not computed from the data.
    print("the higher our log_upper, the more right-skewed (according to g_1) "
          "or at least less left-skewed (according to g_p) things get")
    print("the mean moves to the right a bit, too.")

    below_mean = cdf.Prob(cdf.Mean())
    print("proportion of the population with income < mean", below_mean)
    print("the higher the upper bound, "
          "the greater the proprtion below the mean.")

    thinkplot.Cdf(cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')
def main():
    """Update a ``Euro`` suite over 0..100 with one 'H' and plot it."""
    hypotheses = range(0, 101)
    euro = Euro(hypotheses)
    euro.Update('H')

    thinkplot.Pdf(euro)
    axis_labels = dict(xlabel='x', ylabel='Probability')
    thinkplot.Show(legend=False, **axis_labels)
def Specific_Character(House, Gender, Class, ksweep, lamsweep, Title=''):
    """Produce a prediction plot for a given house, gender and class.

    Chains the data preparation, survival estimate and posterior
    computation, shows the posterior distributions of ``k`` and ``lam``,
    then draws the credible-interval plot.

    Args:
        House: any key of the data returned by ``PrepData()``.
        Gender: 'M', 'F' or 'All'.
        Class: 'Noble', 'Small' or 'All'.
        ksweep: ``[lower limit, upper limit, number of points]`` for the
            uniform prior on k.
        lamsweep: ``[lower limit, upper limit, number of points]`` for the
            uniform prior on lambda.
        Title: title forwarded to ``CredIntPlt``.
    """
    hd = PrepData()  # Get the data
    # Sort by alive/dead for the given attributes
    alive, dead = char_lists(hd, House, Gender, Class)
    # Get ages and lifespans
    introductions, lifetimes = ages(alive, dead)
    # Kaplan-Meier estimate; the second return value is not used here
    sf, _haz = SurvivalHaz(introductions, lifetimes)

    # Uniform priors over the requested sweeps
    lam = thinkbayes2.MakeUniformPmf(lamsweep[0], lamsweep[1], lamsweep[2])
    k = thinkbayes2.MakeUniformPmf(ksweep[0], ksweep[1], ksweep[2])
    # Posterior distributions
    k, lam = MakeDistr(introductions, lifetimes, k, lam)

    thinkplot.PrePlot(2)
    thinkplot.Pdfs([k, lam])
    plt.xlabel('Value')
    plt.ylabel('Probability')
    plt.title('Posterior Distributions')
    # The hint now names the real parameters (it used to say
    # "kweep and lsweep", which do not exist).
    print('If these distributions look chopped off, '
          'adjust ksweep and lamsweep')
    thinkplot.Show()

    mk = k.Mean()
    ml = lam.Mean()
    # 5th and 95th percentiles of each posterior
    kl, kh = k.Percentile(5), k.Percentile(95)
    ll, lh = lam.Percentile(5), lam.Percentile(95)
    CredIntPlt(sf, kl, kh, ll, lh, House, mk, ml, Title)
    plt.show()
def CH5_5():
    """Max operation: roll three six-sided dice and take the maximum.

    The distribution is obtained in three ways and the results are plotted
    together for comparison:

    * simulation
    * enumeration
    * exponentiation of the CDF
    """
    d6 = Die(6)
    k = 3

    # Simulation
    N = 1000
    sim_pmf = SampleMax([d6] * k, N)
    sim_pmf.name = 'sim'
    thinkplot.Pmf(sim_pmf)

    # Enumeration (km^2)
    pair_max = PmfMax(d6, d6)
    print("pmf1.Total() = %.3f" % pair_max.Total())
    enum_pmf = PmfMax(pair_max, d6)
    print("pmf2.Total() = %.3f" % enum_pmf.Total())
    enum_pmf.name = 'enum'
    thinkplot.Pmf(enum_pmf)

    # CDF (exponentiation max). TODO: not fully understood???
    max_cdf = d6.Max(k)
    max_cdf.name = "expo"
    thinkplot.Cdf(max_cdf)

    thinkplot.Show(xlabel='max([d6]*3)', ylabel='probablity')
def CH5_4():
    """Addition operation: roll three six-sided dice and sum them.

    The distribution is obtained in two ways and the plots are compared:

    * simulation: draw random samples and accumulate their sums
    * enumeration: enumerate all possible combinations of values
    """
    d6 = Die(6)
    k = 3
    print("mean(d6) = %.3f, sum(probs) = %.3f" % (d6.Mean(), d6.Total()))

    # Simulation of three dice: the larger N, the more accurate.
    # Drawback: slow.
    N = 1000
    sim_pmf = thinkbayes.SampleSum([d6] * k, N)
    sim_pmf.name = 'sim'
    thinkplot.Pmf(sim_pmf)
    sim_stats = (sim_pmf.Mean(), sim_pmf.Total())
    print("mean([d6]*3) = %.3f, sum(*) = %.3f" % sim_stats)

    # Enumeration: x values are added, y probabilities are multiplied.
    enum_pmf = d6 + d6 + d6
    enum_pmf.name = 'enum'
    thinkplot.Pmf(enum_pmf)

    thinkplot.Show(xlabel='sum([d6]*3)', ylabel='probablity')

    enum_stats = (enum_pmf.Mean(), enum_pmf.Total())
    print("mean([d6]*3) = %.3f, sum(*) = %.3f" % enum_stats)
def ProcessScoresTeamwise(pairs):
    """Average number of goals for each team.

    Plots the CDF of the per-team averages and prints their mean and
    standard deviation.

    pairs: map from (team1, team2) to a list of (score1, score2) entries
    """
    # map from team to list of goals scored
    goals_scored = {}
    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored (team names are not needed)
    lams = [thinkbayes2.Mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes2.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkbayes2.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
def CH7_2():
    """Illustrate the Poisson and exponential distributions.

    http://www.ruanyifeng.com/blog/2015/06/poisson-distribution.html

    1. If a game averages lam goals, the number of goals per game follows
       a Poisson distribution (a goal can happen at any moment).
       e.g. a hospital averages 3 births per hour.
       The focus is the *count*.
    2. The time between goals follows an exponential distribution.
       e.g. the time between births at a hospital
       (one every 20 minutes, i.e. 0.3 h).
       The focus is the *interval*.

    The Poisson distribution is the probability distribution of the number
    of independent events per unit of time; the exponential distribution is
    the probability distribution of the time between independent events.
    """
    # Poisson distribution of the number of births per unit of time.
    births_pmf = thinkbayes.MakePoissonPmf(3, 10, step=1)
    thinkplot.Clf()
    thinkplot.Pmf(births_pmf)
    # thinkplot.Show();
    # NOTE(review): with Show disabled, Clf() runs again below before
    # anything is displayed - presumably the Poisson plot is never seen.

    # Interval between births (20 minutes).
    interval_pmf = thinkbayes.MakeExponentialPmf(0.3, 10, n=200)
    thinkplot.Clf()
    thinkplot.Pmf(interval_pmf)
    thinkplot.Show()
def print_num_albums_per_artist(all_genres):
    """Print album-count statistics per artist and plot their histogram.

    Prints the number of artists, the total number of albums, the average
    number of albums per artist and how many artists have more than six
    albums, then shows a histogram of albums per artist.

    Args:
        all_genres: mapping from artist to a sized collection of albums.
    """
    albums_per_artist = [len(albums) for _artist, albums in all_genres.items()]

    # Map "number of albums" -> "number of artists with that many albums".
    artists_by_album_count = {}
    for count in albums_per_artist:
        artists_by_album_count[count] = artists_by_album_count.get(count, 0) + 1

    num_artists = len(all_genres)
    total_albums = sum(albums_per_artist)
    print("In total,", num_artists, "artists, producing", total_albums,
          "albums.")
    average = "%.2f" % (total_albums / num_artists)
    print("An average of", average, "albums per artist.")

    album_hist = ts2.Hist(artists_by_album_count)
    prolific_artists = sum(
        freq for size, freq in album_hist.Items() if size > 6)
    print(prolific_artists, 'artists with more than 6 albums.')

    tp.Hist(album_hist)
    tp.Show(xlabel='Number of albums',
            ylabel='Count of artists with this number of albums',
            title='Histogram of the number of albums per artist')
def MakePlot(self):
    """Plot the CDFs of the four stored Pmfs and show the figure."""
    # Attributes are looked up one at a time, in plotting order.
    for attr in ('pmf_y', 'prior_zb', 'post_zb', 'pmf_mean_zb'):
        cdf = getattr(self, attr).MakeCdf()
        thinkplot.Cdf(cdf)
    thinkplot.Show()
def main():
    """Plot the CDF of the interpolated log-income sample.

    The sample is built by ``InterpolateSample`` from ``hinc.ReadData()``
    with ``log_upper=6.0``.
    """
    income_df = hinc.ReadData()
    sample = InterpolateSample(income_df, log_upper=6.0)

    thinkplot.Cdf(thinkstats2.Cdf(sample))
    axis_labels = dict(xlabel='household income', ylabel='CDF')
    thinkplot.Show(**axis_labels)
def main():
    """Show four views of ``mystery0.dat`` on a 2x2 grid.

    Panels: histogram of the raw Pmf, histogram of the binned data, a Pmf
    made from the estimated PDF at 101 points, and the CDF.
    """
    data = read_file('mystery0.dat')

    pmf = thinkstats2.MakePmfFromList(data)
    cdf = thinkstats2.MakeCdfFromList(data)
    pdf = thinkstats2.EstimatedPdf(data)

    low, high = min(data), max(data)
    kde_pmf = pdf.MakePmf(numpy.linspace(low, high, 101))
    bin_pmf = thinkstats2.MakePmfFromList(BinData(data, low, high, 51))

    # (plot function, distribution, extra plot options, panel title)
    panels = [
        (thinkplot.Hist, pmf, {'width': 0.1}, 'Naive Pmf'),
        (thinkplot.Hist, bin_pmf, {}, 'Binned Hist'),
        (thinkplot.Pmf, kde_pmf, {}, 'KDE PDF'),
        (thinkplot.Cdf, cdf, {}, 'CDF'),
    ]
    for position, (draw, dist, options, title) in enumerate(panels, 1):
        thinkplot.SubPlot(2, 2, position)
        draw(dist, **options)
        thinkplot.Config(title=title)

    thinkplot.Show()
def main():
    """Update a ``Soccer`` suite with three inter-arrival times.

    The first update uses the mean inter-arrival time to form the prior;
    the next two use 11 and 12. After each update the suite is plotted and
    its mean printed. Finally ``PredRemaining(90 - 23, 2)`` is called.
    """
    suite = Soccer(numpy.linspace(0, 12, 201))

    # The mean number of goals per game was 2.67.
    mean_rate = 2.67 / 2
    mean_interarrival = 90 / mean_rate

    # (observation, plot label, message printed with the mean)
    # NOTE(review): 11 and 12 are presumably minutes between goals - confirm.
    steps = [
        # start with a prior based on the mean interarrival time
        (mean_interarrival, 'prior', 'prior mean'),
        (11, 'posterior 1', 'after one goal'),
        (12, 'posterior 2', 'after two goals'),
    ]
    for observed, label, message in steps:
        suite.Update(observed)
        thinkplot.Pdf(suite, label=label)
        print(message, suite.Mean())

    thinkplot.Show()

    # Plot the predictive distribution.
    remaining = 90 - 23
    suite.PredRemaining(remaining, 2)
def ProcessScoresPairwise(pairs):
    """Average number of goals for each team against each opponent.

    Only pairings with at least three recorded results contribute. Plots
    the CDF of the averages, prints their mean and standard deviation, and
    prints the entries stored under ('BOS', 'VAN').

    pairs: map from (team1, team2) to a list of (score1, score2) entries;
        must contain the key ('BOS', 'VAN')
    """
    # map from (team1, team2) to list of goals scored
    goals_scored = {}
    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault((t1, t2), []).append(g1)
            goals_scored.setdefault((t2, t1), []).append(g2)

    # make a list of average goals scored, skipping sparse pairings
    lams = [thinkstats.Mean(goals)
            for goals in goals_scored.values()
            if len(goals) >= 3]

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))

    print('BOS v VAN', pairs['BOS', 'VAN'])
def main():
    """Update a joint ``Gps`` suite and report the x and y marginals.

    The hypotheses are all coordinate pairs from a 101-point grid on
    [-100, 100]. The suite gets two single updates and one batch update,
    then both marginals are plotted and their mean and standard deviation
    printed.
    """
    axis = numpy.linspace(-100, 100, 101)
    joint = Gps(product(axis, axis))

    for reading in [(51, -15), (48, 90)]:
        joint.Update(reading)

    readings = [
        (11.903060613102866, 19.79168669735705),
        (77.10743601503178, 39.87062906535289),
        (80.16596823095534, -12.797927542984425),
        (67.38157493119053, 83.52841028148538),
        (89.43965206875271, 20.52141889230797),
        (58.794021026248245, 30.23054016065644),
        (2.5844401241265302, 51.012041625783766),
        (45.58108994142448, 3.5718287379754585),
    ]
    joint.UpdateSet(readings)

    thinkplot.PrePlot(2)
    marginal_x = joint.Marginal(0)
    marginal_y = joint.Marginal(1)
    thinkplot.Pdf(marginal_x, label='posterior x')
    thinkplot.Pdf(marginal_y, label='posterior y')
    thinkplot.Show()

    for marginal in (marginal_x, marginal_y):
        print(marginal.Mean(), marginal.Std())
def main():
    """Print the Bayes factor of a triangular 'biased' prior against 'fair'.

    ``fair`` puts all its weight on hypothesis 50. ``bias`` is a triangular
    prior over 0..100 peaking at 50. Each gets an ``AverageLikelihood`` for
    the data ``(140, 110)``, and their ratio is printed.
    """
    fair = Euro()
    fair.Set(50, 1)

    # Triangular prior: weight rises 0..50, then falls back to 0 at 100.
    bias = Euro()
    for x in range(0, 51):
        bias.Set(x, x)
    for x in range(51, 101):
        bias.Set(x, 100 - x)
    bias.Normalize()

    thinkplot.Pmf(bias)
    thinkplot.Show()

    # notice that we've changed the representation of the data
    data = 140, 110

    # print() calls replace Python 2 print statements; output is unchanged.
    like_bias = AverageLikelihood(bias, data)
    print('like_bias', like_bias)

    like_fair = AverageLikelihood(fair, data)
    print('like_fair', like_fair)

    ratio = like_bias / like_fair
    print('Bayes factor', ratio)
def main(script, filename='mystery0.dat'):
    """Plot the data in ``filename`` six ways on a 2x3 grid.

    Panels: plain CDF, CDF with log x-scale, 'exponential' transform,
    normal probability plot, 'pareto' transform and 'weibull' transform.

    Args:
        script: unused by this function.
        filename: file passed to ``ReadFile``.
    """
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)
    thinkplot.PrePlot(rows=2, cols=3)

    def panel(position, title, **cdf_options):
        # Draw the CDF in one cell; whatever thinkplot.Cdf returns is
        # forwarded to Config as keyword options.
        thinkplot.SubPlot(position)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    # Panel 1 does not forward the Cdf return value to Config.
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    panel(2, 'logx', xscale='log')
    panel(3, 'expo', transform='exponential')

    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    panel(5, 'pareto', transform='pareto')
    panel(6, 'weibull', transform='weibull')

    thinkplot.Show(legend=False)
def CH3_2():
    """Locomotive problem (Train).

    One day a locomotive numbered 60 goes by; estimate how many
    locomotives there are in total.

    Assume an upper bound N = 1000 (also try 500, 2000): the estimate is
    sensitive to the upper bound.

    With hypotheses 1..N and locomotive 60 observed:

        hypothesis   prior   likelihood
             1        1/N        0
             2        1/N        0
            ...       ...       ...
            59        1/N        0
            60        1/N       1/60
            61        1/N       1/61
            ...       ...       ...
          1000        1/N       1/1000

    Plots the posterior and prints its mean.
    """
    # Assume the locomotives are numbered 1 - 1000.
    N = 1000
    # range() excludes its stop value, so N + 1 is needed to include the
    # hypothesis N itself (range(1, N) stopped at 999).
    hypoes = range(1, N + 1)
    suite = Train(hypoes)

    suite.Update(60)

    thinkplot.PrePlot(num=1)
    thinkplot.Pmf(suite)
    thinkplot.Show(title='Train',
                   xlabel='Number of trains',
                   ylabel='Probability')
    print(suite.Mean())