def ClassSizes():
    """Compare the actual class-size PMF with its biased counterpart.

    Prints the mean and variance of the actual, biased ('observed') and
    unbiased distributions, then saves a plot of the actual and observed
    PMFs under the root name 'class_size1'.
    """
    def report(dist):
        # Mean first, then variance, matching the order of the printed output.
        print('mean', dist.Mean())
        print('var', dist.Var())

    # Actual distribution of class sizes, taken from the book.
    size_counts = {
        7: 8,
        12: 8,
        17: 14,
        22: 4,
        27: 6,
        32: 12,
        37: 8,
        42: 3,
        47: 2,
    }

    actual = thinkstats2.Pmf(size_counts, label='actual')
    report(actual)

    # Distribution after applying BiasPmf.
    observed = BiasPmf(actual, label='observed')
    report(observed)

    # Apply UnbiasPmf to the biased distribution.
    recovered = UnbiasPmf(observed, label='unbiased')
    report(recovered)

    # Only the actual and observed PMFs are plotted.
    thinkplot.PrePlot(2)
    thinkplot.Pmfs([actual, observed])
    save_options = dict(xlabel='class size', ylabel='PMF', axis=[0, 52, 0, 0.27])
    thinkplot.Save(root='class_size1', **save_options)
def MakePlot(self, root='redline2'):
    """Print the scaled means, then save CDF and PMF figures for z, zb and y.

    The distributions self.pmf_z, self.pmf_zb and self.pmf_y are passed
    through ScaleDists with a factor of 1/60 before plotting.

    root: string filename root; the CDF figure is saved as root and the
          PMF figure as root + 'a'
    """
    scale = 1.0 / 60

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Mean z %s' % (self.pmf_z.Mean() / 60,))
    print('Mean zb %s' % (self.pmf_zb.Mean() / 60,))
    print('Mean y %s' % (self.pmf_y.Mean() / 60,))

    cdf_z = self.pmf_z.MakeCdf()
    cdf_zb = self.pmf_zb.MakeCdf()
    cdf_y = self.pmf_y.MakeCdf()
    cdfs = ScaleDists([cdf_z, cdf_zb, cdf_y], scale)

    thinkplot.Clf()
    thinkplot.PrePlot(3)
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='CDF',
                   formats=FORMATS)

    # Second figure: the same three distributions as PMFs.
    root += 'a'
    pmfs = self.pmf_z, self.pmf_zb, self.pmf_y
    pmfs = ScaleDists(pmfs, scale)

    thinkplot.PrePlot(3)
    thinkplot.Pmfs(pmfs)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='Probability',
                   formats=FORMATS)
def ClassSizes():
    """Print class-size statistics and show the actual vs. observed PMFs.

    Prints the mean and variance of the actual, biased ('observed') and
    unbiased distributions, then displays the actual and observed PMFs.
    """
    def report(dist):
        # A single pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('mean %s' % (dist.Mean(),))
        print('var %s' % (dist.Var(),))

    # start with the actual distribution of class sizes from the book
    d = {
        7: 8,
        12: 8,
        17: 14,
        22: 4,
        27: 6,
        32: 12,
        37: 8,
        42: 3,
        47: 2,
    }

    # form the pmf
    pmf = thinkstats2.MakePmfFromDict(d, 'actual')
    report(pmf)

    # compute the biased pmf
    biased_pmf = BiasPmf(pmf, 'observed')
    report(biased_pmf)

    # unbias the biased pmf
    unbiased_pmf = UnbiasPmf(biased_pmf, 'unbiased')
    report(unbiased_pmf)

    # plot the actual and observed Pmfs
    thinkplot.Pmfs([pmf, biased_pmf])
    thinkplot.Show(xlabel='Class size', ylabel='PMF')
def MakeStep(greq, less):
    """Show the pregnancy-length PMFs of two groups on one figure.

    greq: object whose prglngth attribute feeds the 'greater/equal to 30' PMF
    less: object whose prglngth attribute feeds the 'less than 30' PMF
    """
    limits = [0, 50, 0, 0.6]
    groups = ((greq, 'greater/equal to 30'), (less, 'less than 30'))
    pmfs = [thinkstats2.Pmf(group.prglngth, label=text)
            for group, text in groups]

    thinkplot.Pmfs(pmfs)
    thinkplot.Config(xlabel='Pregnancy length(weeks)', axis=limits)
    thinkplot.Show()
def MakePrice1(player1, player2):
    """Plot the prior distribution of price for both players.

    player1, player2: objects providing PmfPrice()
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    priors = []
    for player, title in ((player1, 'showcase 1'), (player2, 'showcase 2')):
        prior = player.PmfPrice()
        prior.name = title
        priors.append(prior)

    thinkplot.Pmfs(priors)
def CH7_4(show = 1):
    """Mixture distributions.

    show: when truthy, display the Poisson plot and the mixture plot
    returns: pair of mixture Pmfs, one for each suite returned by CH7_3(0)
    """
    suite1, suite2 = CH7_3(0)

    # Means of the two suites.
    mu1 = suite1.Mean()
    mu2 = suite2.Mean()
    print("Mean1: ", mu1)
    print("Mean2: ", mu2)

    if show:
        # Poisson distributions computed from the means
        # (goal distribution for the next game).
        poisson_pmfs = [thinkbayes.MakePoissonPmf(mu, 10, step=1)
                        for mu in (mu1, mu2)]
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)
        thinkplot.Pmfs(poisson_pmfs)
        thinkplot.Show(title='Poisson',
                       xlabel='Goals per game',
                       ylabel='Probability')

    def _mixture_of(suite):
        # Build one Poisson pmf per (lam, prob) item of the suite, store it in
        # a meta-pmf with weight prob, and combine them with MakeMixture.
        upper = 10
        meta = thinkbayes.Pmf()
        for lam, prob in suite.Items():
            component = thinkbayes.MakePoissonPmf(lam, upper, step=1)
            meta.Set(component, y=prob)
        return thinkbayes.MakeMixture(meta, name='mix')

    mix1 = _mixture_of(suite1)
    mix2 = _mixture_of(suite2)

    if show:
        thinkplot.Clf()
        thinkplot.PrePlot(num=2)
        thinkplot.Pmfs([mix1, mix2])
        thinkplot.Show(title='Mixture',
                       xlabel='Goals per game',
                       ylabel='Probability')

    return mix1, mix2
def PlotSuites(suites, root):
    """Displays the given suites as PMFs on one figure.

    suites: sequence of Suite objects
    root: string; accepted but not used here, since the figure is shown
          rather than written to a file
    """
    labels = dict(xlabel='Heads probability', ylabel='PMF')

    thinkplot.Clf()
    count = len(suites)
    thinkplot.PrePlot(count)
    thinkplot.Pmfs(suites)
    thinkplot.Show(**labels)
def PlotBeliefs(self, root):
    """Saves a figure of self.prior and self.posterior.

    root: string filename root for saved figure
    """
    beliefs = [self.prior, self.posterior]
    save_options = dict(xlabel='price ($)', ylabel='PMF', formats=FORMATS)

    thinkplot.Clf()
    thinkplot.PrePlot(num=2)
    thinkplot.Pmfs(beliefs)
    thinkplot.Save(root=root, **save_options)
def MakeStep(male, female):
    """Show the weekend alcohol consumption PMFs of two groups.

    male: object whose alcwknd attribute feeds the 'Male' PMF
    female: object whose alcwknd attribute feeds the 'Female' PMF
    """
    limits = [0, 800, 0, 0.1]
    groups = ((male, 'Male'), (female, 'Female'))
    pmfs = [thinkstats2.Pmf(group.alcwknd, label=text)
            for group, text in groups]

    thinkplot.Pmfs(pmfs)
    thinkplot.Config(
        xlabel='Alcohol Consumption (grams)',
        ylabel='PMF',
        axis=limits,
        title='Weekend Alcohol Consumption',
    )
    thinkplot.Show()
def PlotSuites(suites, root):
    """Saves a figure showing the given suites as PMFs.

    suites: sequence of Suite objects
    root: string filename root to write
    """
    save_options = dict(xlabel='x', ylabel='Probability', formats=['pdf', 'eps'])

    thinkplot.Clf()
    count = len(suites)
    thinkplot.PrePlot(count)
    thinkplot.Pmfs(suites)
    thinkplot.Save(root=root, **save_options)
def plot_bar_step(first_pmf, other_pmf):
    """Show two PMFs as a bar graph and as a step graph.

    PrePlot is called with cols=2 for the bar graph; the step graph is then
    drawn after SubPlot(2).

    first_pmf, other_pmf: the two PMFs to compare
    """
    bar_width = 0.5
    limits = (27, 46, 0, 0.6)

    # bar graph
    thinkplot.PrePlot(2, cols=2)
    for pmf, side in ((first_pmf, "left"), (other_pmf, "right")):
        thinkplot.Hist(pmf, align=side, width=bar_width)
    thinkplot.Config(xlabel="weeks", ylabel="probability", axis=list(limits))

    # step graph
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Show(xlabel="weeks", axis=list(limits))
def main():
    """Show the PMF of binned running speeds next to ObservedPmf(pmf, 7.5)."""
    binned = BinData(GetSpeeds(ReadResults()), 3, 12, 100)

    actual = thinkstats2.Pmf(binned, 'speeds')
    observed = ObservedPmf(actual, 7.5)

    thinkplot.PrePlot(2)
    thinkplot.Pmfs([actual, observed])
    thinkplot.Show(
        title='PMF of running speed',
        xlabel='speed (mph)',
        ylabel='probability',
    )
def main():
    """Show the PMF of binned running speeds next to its ObservedPmf result."""
    # extract data
    binned = BinData(GetSpeeds(ReadResults()), 3, 12, 100)

    # make and plot pmfs
    actual = thinkstats2.Pmf(binned, 'speeds')
    # NOTE(review): ObservedPmf is called with only a label here, without an
    # observer speed argument -- confirm this matches its signature.
    observed = ObservedPmf(actual, label='observed')

    thinkplot.PrePlot(2)
    thinkplot.Pmfs([actual, observed])
    thinkplot.Show(xlabel='speed (mph)', axis=[4, 12, 0, 0.1])
def pmf_stuff(width, x_low, x_high, third, pmf_one, pmf_two, label, y_axis_scale):
    """Show two PMFs as side-by-side bars and as a step plot.

    width: bar width passed to thinkplot.Hist
    x_low, x_high, third, y_axis_scale: the four entries of the axis list, in
        that order
    pmf_one, pmf_two: the PMFs to compare
    label: x-axis label used for both plots
    """
    limits = [x_low, x_high, third, y_axis_scale]
    shared = dict(xlabel=label, ylabel='PMF', axis=limits)

    # Bar plot.
    thinkplot.PrePlot(2, cols=2)
    for pmf, side in ((pmf_one, 'right'), (pmf_two, 'left')):
        thinkplot.Hist(pmf, align=side, width=width)
    thinkplot.Config(**shared)

    # Step plot.
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_one, pmf_two])
    thinkplot.Config(**shared)
    thinkplot.Show()
def PlotPmfs(self, root='redline0'):
    """Saves a figure of the scaled PMFs self.pmf_z and self.pmf_zb.

    Both distributions are passed through ScaleDists with a factor of 1/60
    before plotting.

    root: string filename root for the saved figure
    """
    pmfs = ScaleDists([self.pmf_z, self.pmf_zb], 1.0 / 60)

    thinkplot.Clf()
    thinkplot.PrePlot(2)
    thinkplot.Pmfs(pmfs)

    # The curves drawn here are PMFs, so the y axis is a probability,
    # not a CDF.
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='Probability',
                   formats=FORMATS)
def MakePlot(self, root='redline3'):
    """Save one CDF figure and two PMF figures.

    The CDF figure (saved as root) shows self.prior_x, self.post_x and
    self.pmf_y. The PMF figures are saved as root + 'a' (prior and posterior
    only) and root + 'b' (all three). Every distribution is passed through
    ScaleDists with a factor of 1/60 first.

    root: string filename root
    """
    scale = 1.0 / 60

    # observed gaps
    cdfs = ScaleDists(
        [self.prior_x.MakeCdf(), self.post_x.MakeCdf(), self.pmf_y.MakeCdf()],
        scale,
    )

    thinkplot.Clf()
    thinkplot.PrePlot(3)
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='CDF',
                   formats=FORMATS)

    pmf_figures = (
        ('a', (self.prior_x, self.post_x)),
        ('b', (self.prior_x, self.post_x, self.pmf_y)),
    )
    for suffix, dists in pmf_figures:
        scaled = ScaleDists(dists, scale)
        # PrePlot(3) is used for both figures, including the two-curve one.
        thinkplot.PrePlot(3)
        thinkplot.Pmfs(scaled)
        thinkplot.Save(root=root + suffix,
                       xlabel='Time (min)',
                       ylabel='Probability',
                       formats=FORMATS)
def main():
    """Show the PMF of binned running speeds next to ObservedPmf(pmf, 7, ...)."""
    binned = BinData(GetSpeeds(ReadResults()), 3, 12, 100)

    actual = thinkstats2.Pmf(binned, 'speeds')
    biased = ObservedPmf(actual, 7, 'biased speeds')

    thinkplot.PrePlot(2)
    thinkplot.Pmfs([actual, biased])
    thinkplot.Show(
        title='PMF of running speed',
        xlabel='speed (mph)',
        ylabel='probability',
        axis=[0, 13, 0, 0.27],
    )
def main():
    """Run four Update calls on p1, p2, q1, q2, plot them, and compare.

    The p distributions and q distributions are plotted in separate subplots;
    the results of p1 > p2 and q1 > q2 are printed after the figure is shown.
    """
    def labelled_pair(first, second):
        # One normal pmf carrying the first label, plus a copy carrying the second.
        base = thinkbayes2.MakeNormalPmf(0, 1, 3, n=101)
        base.label = first
        return base, base.Copy(label=second)

    p1, p2 = labelled_pair('p1', 'p2')
    q1, q2 = labelled_pair('q1', 'q2')

    # Each call feeds on the results of the previous ones, so order matters.
    p1, q1 = Update(p1, q1, True)
    p1, q2 = Update(p1, q2, True)
    p2, q1 = Update(p2, q1, True)
    p2, q2 = Update(p2, q2, False)

    thinkplot.PrePlot(num=4, rows=2)
    thinkplot.Pmfs([p1, p2])
    thinkplot.Config(legend=True)

    thinkplot.SubPlot(2)
    thinkplot.Pmfs([q1, q2])
    thinkplot.Show()

    print('Prob p1 > p2', p1 > p2)
    print('Prob q1 > q2', q1 > q2)
def PlotSuites(suites, root):
    """Saves a figure, with legend, showing the given suites as PMFs.

    suites: sequence of Suite objects
    root: string filename root to write
    """
    save_options = dict(
        xlabel='Percentage of Active Female Users',
        ylabel='Probability',
        formats=['pdf', 'png'],
        legend=True,
    )

    thinkplot.Clf()
    count = len(suites)
    thinkplot.PrePlot(count)
    thinkplot.Pmfs(suites)
    thinkplot.Save(root=root, **save_options)
def main():
    """Update two Sat objects with scores 780 and 760, compare and plot them.

    Prints PmfProbGreater for both orderings of alice and bob, then shows
    both distributions on one figure.
    """
    exam = Exam()

    alice = Sat(exam)
    alice.name = 'alice'
    alice.Update(780)

    bob = Sat(exam)
    bob.name = 'bob'
    bob.Update(760)

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Prob Alice is "smarter": %s' % (PmfProbGreater(alice, bob),))
    print('Prob Bob is "smarter": %s' % (PmfProbGreater(bob, alice),))

    thinkplot.Pmfs([alice, bob])
    thinkplot.Show(xlabel='x', ylabel='Probability')
def MakeFigures(firsts, others):
    """Save the pregnancy-length PMF figure and the PMF-difference bar chart.

    firsts: DataFrame
    others: DataFrame
    """
    first_pmf = thinkstats2.Pmf(firsts.prglngth, label='first')
    other_pmf = thinkstats2.Pmf(others.prglngth, label='other')

    bar_width = 0.45
    limits = (27, 46, 0, 0.6)

    # Bars first, then the step plot after SubPlot(2).
    thinkplot.PrePlot(2, cols=2)
    for pmf, side in ((first_pmf, 'right'), (other_pmf, 'left')):
        thinkplot.Hist(pmf, align=side, width=bar_width)
    thinkplot.Config(xlabel='weeks', ylabel='probability', axis=list(limits))

    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Save(root='probability_nsfg_pmf',
                   xlabel='weeks',
                   axis=list(limits))

    # Difference between the two PMFs for weeks 35 through 45, times 100.
    weeks = range(35, 46)
    diffs = [100 * (first_pmf.Prob(week) - other_pmf.Prob(week))
             for week in weeks]

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(root='probability_nsfg_diffs',
                   title='Difference in PMFs',
                   xlabel='weeks',
                   ylabel='percentage points',
                   legend=False)
def MakePlots(player1, player2):
    """Generates two plots.

    price1 shows the priors for the two players
    price2 shows the distribution of diff for the two players

    player1, player2: objects providing PmfPrice(), CdfDiff() and ProbOverbid()
    """
    # plot the prior distribution of price for both players
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)
    pmf1 = player1.PmfPrice()
    pmf1.name = 'showcase 1'
    pmf2 = player2.PmfPrice()
    pmf2.name = 'showcase 2'
    thinkplot.Pmfs([pmf1, pmf2])
    thinkplot.Save(root='price1',
                   xlabel='price ($)',
                   ylabel='PDF',
                   formats=FORMATS)

    # plot the historical distribution of underness for both players
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)
    cdf1 = player1.CdfDiff()
    cdf1.name = 'player 1'
    cdf2 = player2.CdfDiff()
    cdf2.name = 'player 2'

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Player median %s' % (cdf1.Percentile(50),))
    print('Player median %s' % (cdf2.Percentile(50),))

    print('Player 1 overbids %s' % (player1.ProbOverbid(),))
    print('Player 2 overbids %s' % (player2.ProbOverbid(),))

    thinkplot.Cdfs([cdf1, cdf2])
    thinkplot.Save(root='price2',
                   xlabel='diff ($)',
                   ylabel='CDF',
                   formats=FORMATS)
def CH6_2(price1, price2):
    """Price distributions of the two showcases.

    price1, price2: sequences of prices, one per showcase
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    # The price values contain no repeats, so a PMF built directly from the
    # raw lists (thinkbayes.MakePmfFromList) shows nothing useful; an
    # estimated PDF evaluated on a grid is plotted instead.
    upper = max(max(price1), max(price2))
    xs = numpy.linspace(0, upper + 100, num=150)

    pdfs = [thinkbayes.EstimatedPdf(prices) for prices in (price1, price2)]
    pmfs = [pdf.MakePmf(xs, name=tag)
            for pdf, tag in zip(pdfs, ('showcase1', 'showcase2'))]

    thinkplot.Pmfs(pmfs)
    thinkplot.Show(xlabel='price $', ylabel='PMF')
# Split the records by birth order and build a pregnancy-length PMF for each.
firsts = tmp[tmp.birthord == 1]
others = tmp[tmp.birthord != 1]
first_pmf = thinkstats2.Pmf(firsts.prglngth)
others_pmf = thinkstats2.Pmf(others.prglngth)

# %%
# Bar-chart view
width = 0.45
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(first_pmf, align='right', width=width)
thinkplot.Hist(others_pmf, align='left', width=width)
# The option name is 'xlabel'; a misspelled keyword does not label the x axis.
thinkplot.Config(xlabel='week', ylabel='probability', axis=[27, 46, 0, 0.6])
thinkplot.show()

# %%
# Step-function view
thinkplot.PrePlot(2)
thinkplot.Pmfs([first_pmf, others_pmf])
thinkplot.show(xlabel='week', ylabel='probability', axis=[27, 46, 0, 0.6])

# %% [markdown]
# ## 3.3 Other visualizations

# %%
# Show the differences as a bar chart
weeks = range(35, 46)
diffs = []
for week in weeks:
    p1 = first_pmf.Prob(week)
    p2 = others_pmf.Prob(week)
    # Difference between the two probabilities, times 100.
    diff = 100 * (p1 - p2)
    diffs.append(diff)
thinkplot.Bar(weeks, diffs)

# %% [markdown]
# Scenario 1: compare the PMF of daily time spent for male vs. female.
male_ds = advertisement_data[advertisement_data.Male == 1]
female_ds = advertisement_data[advertisement_data.Male == 0]
male_pmf = thinkstats2.Pmf(male_ds.Daily_Time_Spent, label='male')
female_pmf = thinkstats2.Pmf(female_ds.Daily_Time_Spent, label='female')

# Plot the PMF of daily time spent for male and female.
width = 20
axis = [30, 90, 0, 0.01]
thinkplot.PrePlot(2)
#thinkplot.SubPlot(2)
thinkplot.Pmfs([male_pmf, female_pmf])
thinkplot.Config(xlabel='Daily Time Spent in Minutes', axis=axis)
thinkplot.show()

# Scenario 2: compare the PMF of daily time spent for age group 18-29 vs. 30-39.
bins = [18, 30, 40, 50, 60, 70, 120]
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
# right=False makes every bin left-closed / right-open ([18, 30), [30, 40), ...)
# so the bins agree with the labels: with the default right-closed bins an age
# of exactly 30 would be labelled '18-29', 40 would be labelled '30-39', etc.
advertisement_data['agerange'] = pd.cut(
    advertisement_data.Age,
    bins,
    labels=labels,
    right=False,
    include_lowest=True,
)
age_grp_30_to_39_ds = advertisement_data[advertisement_data.agerange == '30-39']
age_grp_18_to_29_ds = advertisement_data[advertisement_data.agerange == '18-29']
age_grp_30_to_39_pmf = thinkstats2.Pmf(age_grp_30_to_39_ds.Daily_Time_Spent, label='30-39')
age_grp_18_to_29_pmf = thinkstats2.Pmf(age_grp_18_to_29_ds.Daily_Time_Spent, label='18-29')
# Exploratory Analysis of the Data # In[25]: width=0.45 axis = [27, 46, 0, 0.6] thinkplot.PrePlot(2, cols=2) thinkplot.Hist(timeduration_pmf, align='right', width=width) thinkplot.Hist(temperature_pmf, align='left', width=width) thinkplot.Config(xlabel='Time Duration comparison against Temperature)', ylabel='PMF', axis=axis) thinkplot.PrePlot(2) thinkplot.SubPlot(2) thinkplot.Pmfs([timeduration_pmf, temperature_pmf]) thinkplot.Config(xlabel='Point in Time duration (temperature)', axis=axis) # Additional Exploratory Analysis # In[26]: df.temperature.mean() #62.999 df.tripduration.mean() #11.44 mode(df.month) #8, People ride the most in August mode(df.week) #30 #People rode the most the 30th week of the month mode(df.day) #1, people rode the most on the first of the month mode(df.hour) #17, people mostly ride at 5PM
# Use the NSFG respondent variable NUMKDHH to construct the actual distribution
# of the number of children under 18 in the household.
resp = nsfg.ReadFemResp()
pmf = thinkstats2.Pmf(resp.numkdhh, label='actual')


# Compute the biased distribution we would see if we surveyed the children and
# asked them how many children, including themselves, are under 18 in their
# household.
def BiasPmf(pmf, label):
    """Return a relabelled copy of pmf after Mult(x, x) on every value x, normalized."""
    weighted = pmf.Copy(label=label)
    for value, _ in pmf.Items():
        weighted.Mult(value, value)
    weighted.Normalize()
    return weighted


# Plot the actual and observed distributions.
biased_pmf = BiasPmf(pmf, label='observed')
thinkplot.PrePlot(2)
thinkplot.Pmfs([pmf, biased_pmf])
thinkplot.Config(xlabel='Number of children', ylabel='pmf')
thinkplot.show()

# Compute their means.
print('The actual mean is: ', format(pmf.Mean(), '.2f'))
print('The biased mean is: ', format(biased_pmf.Mean(), '.2f'))
others = live[live.birthord != 1]

first_pmf = thinkstats2.Pmf(firsts.prglngth)
other_pmf = thinkstats2.Pmf(others.prglngth)

## Make comparison plot: bars first, then the step plot after SubPlot(2)
width = 0.45
thinkplot.PrePlot(2, cols=2)
for _pmf, _side in ((first_pmf, 'right'), (other_pmf, 'left')):
    thinkplot.Hist(_pmf, align=_side, width=width)
thinkplot.Config(
    xlabel='weeks',
    ylabel='probability',
    axis=[27, 46, 0, 0.6],
)

thinkplot.PrePlot(2)
thinkplot.SubPlot(2)
thinkplot.Pmfs([first_pmf, other_pmf])
thinkplot.Show(xlabel='weeks', axis=[27, 46, 0, 0.6])

## Make bar chart comparison: PMF difference per week, times 100
weeks = range(35, 46)
diffs = [(first_pmf[week] - other_pmf[week]) * 100 for week in weeks]
thinkplot.Bar(weeks, diffs)
thinkplot.Show(xlabel='weeks', ylabel='diff - %')

## Class size paradox
# In[54]: thinkplot.Pmf(flfindistdfpmf) thinkplot.Pmf(vfindistdfpmf) # In[55]: thinkplot.PrePlot(2) thinkplot.subplot(2) #axis = [0, 800, 0, 0.0005] thinkplot.Pmfs([flfindistdfpmf,vfindistdfpmf ]) thinkplot.Show(xlabel = 'Total Revenue', ylabel = 'PMF') # # Lets plot PMF of log transformed columns # In[56]: findistdf.columns # In[57]: lgflfindistdfpmf = thinkstats2.Pmf(flfindistdf['lg_TOTALREV'], label='FLORIDA')
import nsfg
import thinkstats2
import thinkplot
import probability

# Actual distribution of numkdhh across the respondents.
p = nsfg.ReadFemResp()
act_pmf = thinkstats2.Pmf(p.numkdhh, label='actual')
print(act_pmf)

# Distribution returned by probability.BiasPmf for the actual PMF.
bias_pmf = probability.BiasPmf(act_pmf, label='observed')
print(bias_pmf)

print('Mean number of children, actual: ', act_pmf.Mean())
print('Mean number of children, biased: ', bias_pmf.Mean())

fig = thinkplot.Pmfs([act_pmf, bias_pmf])
#thinkplot.show(xlabel='No. of Children', ylabel='pmf')

# Axis labels are plot configuration, so they are applied with Config before
# saving; SaveFormat is only given the file root and the format.
thinkplot.Config(xlabel='No. of Children', ylabel='pmf')
thinkplot.SaveFormat(root='act_vs_biased', fmt='png')