def MakePlot(self, root='redline2'):
    """Plot the CDFs and PMFs of z, zb and y with the axis scaled by 1/60.

    Prints the mean of each Pmf divided by 60, saves the CDF figure under
    `root`, then saves the PMF figure under `root + 'a'`.

    root: string filename root for the saved figures
    """
    # NOTE(review): the division by 60 together with the 'Time (min)' axis
    # label suggests the stored values are in seconds -- confirm against the
    # code that builds these Pmfs.
    #
    # Single-argument print() calls emit the same text on Python 2 and 3;
    # the original print statements are a syntax error on Python 3.
    print('Mean z %s' % (self.pmf_z.Mean() / 60,))
    print('Mean zb %s' % (self.pmf_zb.Mean() / 60,))
    print('Mean y %s' % (self.pmf_y.Mean() / 60,))

    cdf_z = self.pmf_z.MakeCdf()
    cdf_zb = self.pmf_zb.MakeCdf()
    cdf_y = self.pmf_y.MakeCdf()

    cdfs = ScaleDists([cdf_z, cdf_zb, cdf_y], 1.0 / 60)

    thinkplot.Clf()
    thinkplot.PrePlot(3)
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='CDF',
                   formats=FORMATS)

    # second figure: same three distributions as PMFs, saved as root + 'a'
    root += 'a'
    pmfs = self.pmf_z, self.pmf_zb, self.pmf_y
    pmfs = ScaleDists(pmfs, 1.0 / 60)

    thinkplot.PrePlot(3)
    thinkplot.Pmfs(pmfs)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='Probability',
                   formats=FORMATS)
def MakePlots(player1, player2):
    """Save the two price figures for a pair of players.

    'price1' shows the priors for the two players.
    'price2' shows the distribution of diff for the two players.

    Also prints the 50th percentile of each player's diff CDF and the
    result of each player's ProbOverbid().
    """
    # figure 1: prior distribution of price for both players
    MakePrice1(player1, player2)
    thinkplot.Save(root='price1',
                   xlabel='price ($)',
                   ylabel='PDF',
                   formats=FORMATS)

    # figure 2: historical distribution of underness for both players
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    diff_cdfs = []
    for label, player in (('player 1', player1), ('player 2', player2)):
        diff_cdf = player.CdfDiff()
        diff_cdf.name = label
        diff_cdfs.append(diff_cdf)

    for diff_cdf in diff_cdfs:
        print('Player median', diff_cdf.Percentile(50))

    print('Player 1 overbids', player1.ProbOverbid())
    print('Player 2 overbids', player2.ProbOverbid())

    thinkplot.Cdfs(diff_cdfs)
    thinkplot.Save(root='price2',
                   xlabel='diff ($)',
                   ylabel='CDF',
                   formats=FORMATS)
def MakeBabyBoom():
    """Plot the CDF of interarrival times on linear and log-y scales.

    Saves two figures:
      'analytic_interarrivals': CDF and CCDF (log y) side by side.
      'analytic_interarrivals_model': CCDF of the data against the CCDF of
          a random exponential sample with the same number of values.
    """
    # compute the interarrival times
    df = ReadBabyBoom()
    diffs = df.minutes.diff()
    cdf = thinkstats2.Cdf(diffs, label='actual')

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals')

    n = len(diffs)

    # expovariate(lam) has mean 1 / lam, so lam must be a rate per minute
    # for the sample to be comparable with `diffs`.  The original
    # `44 / 24 * 60.0` multiplied by 60 instead of dividing (and also
    # floor-divided 44 / 24 on Python 2), giving a rate far too large.
    # NOTE(review): presumably 44 arrivals over 24 hours -- confirm against
    # the dataset.
    lam = 44.0 / (24 * 60)
    sample = [random.expovariate(lam) for _ in range(n)]

    model = thinkstats2.Cdf(sample, label='model')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, model], complement=True)
    thinkplot.Save(root='analytic_interarrivals_model',
                   title='Time between births',
                   xlabel='minutes',
                   ylabel='CCDF',
                   yscale='log')
def main():
    """Save the PMF of observed speeds and the CDFs of actual vs. observed.

    Reads the relay results, bins the speeds, builds the PMF of actual
    speeds and the biased PMF returned by ObservedPmf, then saves
    'observed_speeds' (biased PMF only) and 'observed_speeds_cdf' (both
    CDFs).
    """
    speeds = relay.BinData(relay.GetSpeeds(relay.ReadResults()), 3, 12, 100)

    # distribution of actual speeds (only plotted later, as a CDF)
    actual_pmf = thinkstats2.Pmf(speeds, 'actual speeds')

    # biased distribution seen by the observer
    # NOTE(review): 7.5 is presumably the observer's own speed -- confirm
    # against ObservedPmf.
    observed_pmf = ObservedPmf(actual_pmf, 7.5, label='observed speeds')

    thinkplot.Pmf(observed_pmf)
    thinkplot.Save(root='observed_speeds',
                   title='PMF of running speed',
                   xlabel='speed (mph)',
                   ylabel='PMF')

    actual_cdf = thinkstats2.Cdf(actual_pmf)
    observed_cdf = thinkstats2.Cdf(observed_pmf)

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([actual_cdf, observed_cdf])
    thinkplot.Save(root='observed_speeds_cdf',
                   title='CDF of running speed',
                   xlabel='speed (mph)',
                   ylabel='CDF')
def MakeCdfs(male, female):
    """Show the CDFs of the `alcwknd` column for the two groups.

    male, female: objects exposing an `alcwknd` attribute
    """
    cdfs = [
        thinkstats2.Cdf(male.alcwknd, label='Male'),
        thinkstats2.Cdf(female.alcwknd, label='Female'),
    ]

    thinkplot.PrePlot(len(cdfs))
    thinkplot.Cdfs(cdfs)
    thinkplot.Config(title='Weekend Alcohol Consumption',
                     xlabel='Alcohol Consumed (grams)',
                     ylabel='CDF')
    thinkplot.Show()
def MakeCdfs(male, female):
    """Show the CDFs of the `totalwgt_lb` column for the two groups.

    male, female: objects exposing a `totalwgt_lb` attribute
    """
    labeled_groups = (('Male', male), ('Female', female))
    cdfs = [thinkstats2.Cdf(group.totalwgt_lb, label=label)
            for label, group in labeled_groups]

    thinkplot.PrePlot(2)
    thinkplot.Cdfs(cdfs)

    options = dict(xlabel='Baby Weight (Lbs)',
                   ylabel='CDF',
                   title='Baby Weights')
    thinkplot.Config(**options)
    thinkplot.Show()
def MakeFigures(pool, firsts, others):
    """Create and save the age-model figures for the book.

    Saves, in order: the CDF of age and the CDF of weight for `pool`, the
    age CDFs and weight CDFs of `firsts` against `others`, and a scatter
    plot of the (ages, weights) returned by GetAgeWeight(pool).

    pool, firsts, others: objects exposing `age_cdf` and `weight_cdf`
    """
    # CDF of all ages
    thinkplot.Clf()
    thinkplot.Cdf(pool.age_cdf)
    thinkplot.Save(root='agemodel_age_cdf',
                   title="Distribution of mother's age",
                   xlabel='age (years)',
                   ylabel='CDF',
                   legend=False)

    # CDF of all weights
    thinkplot.Clf()
    thinkplot.Cdf(pool.weight_cdf)
    thinkplot.Save(root='agemodel_weight_cdf',
                   title="Distribution of birth weight",
                   xlabel='birth weight (oz)',
                   ylabel='CDF',
                   legend=False)

    # plot CDFs of birth ages for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.age_cdf, others.age_cdf])
    thinkplot.Save(root='agemodel_age_cdfs',
                   title="Distribution of mother's age",
                   xlabel='age (years)',
                   ylabel='CDF')

    # plot CDFs of birth weights for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.weight_cdf, others.weight_cdf])
    thinkplot.Save(root='agemodel_weight_cdfs',
                   title="Distribution of birth weight",
                   xlabel='birth weight (oz)',
                   ylabel='CDF')

    # make a scatterplot of ages and weights
    ages, weights = GetAgeWeight(pool)
    # Use Clf() like every other call in this function; the lowercase
    # `clf` alias is not available in all thinkplot versions.
    thinkplot.Clf()
    thinkplot.Scatter(ages, weights, alpha=0.2)
    thinkplot.Save(root='agemodel_scatter',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   legend=False)
def MakePrice2(player1, player2):
    """Plot the historical distribution of underness for both players."""
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    labeled_cdfs = []
    for label, player in (('player 1', player1), ('player 2', player2)):
        diff_cdf = player.CdfDiff()
        diff_cdf.name = label
        labeled_cdfs.append(diff_cdf)

    thinkplot.Cdfs(labeled_cdfs)
def MakeCdfs(greq, less):
    """Show the CDFs of `totalwgt_lb` for two groups and print their medians.

    greq, less: objects exposing a `totalwgt_lb` attribute; the labels
        describe them as 'greater/equal to 30' and 'less than 30'.
    """
    greqcdf = thinkstats2.Cdf(greq.totalwgt_lb, label='greater/equal to 30')
    lesscdf = thinkstats2.Cdf(less.totalwgt_lb, label='less than 30')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([greqcdf, lesscdf])
    thinkplot.Config(xlabel='Weight (lbs)', ylabel='CDF')
    thinkplot.Show()

    # Single-argument print() calls emit the same text on Python 2 and 3;
    # the original print statements are a syntax error on Python 3.
    print('Greater/equal to 30 50th percentile: %s'
          % (greqcdf.Percentile(50),))
    print('Less than 30 50th percentile: %s' % (lesscdf.Percentile(50),))
def TestSample(live):
    """Save the CDF of `totalwgt_lb` next to the CDF of a sample drawn from it.

    live: object exposing a `totalwgt_lb` attribute
    """
    actual_cdf = thinkstats2.Cdf(live.totalwgt_lb, label='totalwgt_lb')

    # draw 1000 values from the CDF and build a second CDF from them
    resampled_cdf = thinkstats2.Cdf(actual_cdf.Sample(1000), label='sample')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([actual_cdf, resampled_cdf])
    thinkplot.Save(root='cumulative_sample',
                   xlabel='weight (pounds)',
                   ylabel='CDF')
def CH6_5(diff1, diff2):
    """Show the CDFs of the bid differences for the two groups of showcases.

    diff1, diff2: sequences of differences, passed to MakeCdfFromList
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    cdf_a = thinkbayes.MakeCdfFromList(diff1, name='diff1')
    cdf_b = thinkbayes.MakeCdfFromList(diff2, name='diff2')

    thinkplot.Cdfs([cdf_a, cdf_b])
    thinkplot.Show(xlabel='diff $', ylabel="CDF")

    # Evaluate CDF(diff <= 0) to judge whether the contestants tend to
    # underestimate the value of the goods.
    prob_a = cdf_a.Prob(0)
    prob_b = cdf_b.Prob(0)
    print(prob_a, prob_b)
def PlotOutliers(samples):
    """Make CDFs showing the distribution of outliers.

    For each (label, sample) pair, keeps the values below 150 and saves the
    CDFs of those values together as 'variability_cdfs'.

    samples: map from label to sequence of values
    """
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    cdfs = [
        thinkbayes.MakeCdfFromList([x for x in sample if x < 150], label)
        for label, sample in samples.items()
    ]

    thinkplot.Clf()
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root='variability_cdfs',
                   title='CDF of height',
                   xlabel='Reported height (cm)',
                   ylabel='CDF')
def TestSample(live):
    """Plot the distribution of weights against a random sample of it.

    live: DataFrame for live births
    """
    weight_cdf = thinkstats2.Cdf(live.totalwgt_lb, label='totalwgt_lb')

    drawn = weight_cdf.Sample(1000)
    drawn_cdf = thinkstats2.Cdf(drawn, label='sample')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([weight_cdf, drawn_cdf])

    save_options = dict(root='cumulative_sample',
                        xlabel='weight (pounds)',
                        ylabel='CDF')
    thinkplot.Save(**save_options)
def PlotPosteriors(self, other):
    """Save the posterior CDFs of efficacy for `self` and `other`.

    Each curve is labelled with the object's `score`.

    self, other: Sat objects.
    """
    thinkplot.Clf()
    thinkplot.PrePlot(num=2)

    posterior_cdfs = [
        thinkbayes2.Cdf(sat, label='posterior %d' % sat.score)
        for sat in (self, other)
    ]

    thinkplot.Cdfs(posterior_cdfs)
    thinkplot.Save(root='sat_posteriors_eff',
                   xlabel='efficacy',
                   ylabel='CDF',
                   axis=[0, 4.6, 0.0, 1.0],
                   formats=['pdf', 'eps'])
def TestCorrelation(cdf):
    """Test the correlated generator.

    Draws 10000 values from CorrelatedGenerator(cdf, 0.4), prints the
    requested correlation next to the measured serial correlation, and
    shows the input CDF against the CDF of the generated values.

    cdf: distribution passed to CorrelatedGenerator
    """
    n = 10000
    rho = 0.4

    rdt_seq = CorrelatedGenerator(cdf, rho)
    # The builtin next() works on Python 2.6+ and 3; the .next() method
    # exists only on Python 2 iterators.
    xs = [next(rdt_seq) for _ in range(n)]

    rho2 = correlation.SerialCorr(xs)
    print(rho, rho2)
    cdf2 = thinkbayes.MakeCdfFromList(xs)

    thinkplot.Cdfs([cdf, cdf2])
    thinkplot.Show()
def MakePlot(self, root='redline1'):
    """Save the prior and posterior CDFs of the passenger arrival rate.

    root: string filename root for the saved figure
    """
    thinkplot.Clf()
    thinkplot.PrePlot(2)

    # convert units to passengers per minute
    per_minute = 60
    prior_cdf = self.prior_lam.MakeCdf()
    prior_cdf = prior_cdf.Scale(per_minute)
    post_cdf = self.post_lam.MakeCdf()
    post_cdf = post_cdf.Scale(per_minute)

    thinkplot.Cdfs([prior_cdf, post_cdf])
    thinkplot.Save(root=root,
                   xlabel='Arrival rate (passengers / min)',
                   ylabel='CDF',
                   formats=FORMATS)
def compareAlaskaAirlinesCdf(alaska, others):
    """Save the arrival-delay CDFs of Alaska Airlines and the other airlines.

    Background, per JD Power: among traditional carriers, Alaska Airlines
    ranks highest for the 12th consecutive year.
    https://www.jdpower.com/business/press-releases/2019-north-america-airline-satisfaction-study

    alaska, others: objects exposing an `ARRIVAL_DELAY` attribute
    """
    delay_cdfs = [
        thinkstats2.Cdf(alaska.ARRIVAL_DELAY, label='Alaska Airlines'),
        thinkstats2.Cdf(others.ARRIVAL_DELAY, label='other'),
    ]

    thinkplot.PrePlot(len(delay_cdfs))
    thinkplot.Cdfs(delay_cdfs)
    thinkplot.Save(root='AlaskaAirlines_ArrivalDelay_cdf',
                   title='Arrival delay',
                   xlabel='arrival delay (min)',
                   ylabel='CDF',
                   axis=[-20, 40, 0, 1])
def MakeFigures(live, firsts, others):
    """Create the birth-weight PMF and CDF figures for the book.

    Prints the number of weights before and after dropping missing values
    for each group, then saves 'cumulative_birthwgt_pmf' and
    'cumulative_birthwgt_cdf'.

    live: DataFrame (not used here)
    firsts: DataFrame
    others: DataFrame
    """
    first_all = firsts.totalwgt_lb
    first_valid = first_all.dropna()
    print('Firsts', len(first_all), len(first_valid))
    #assert len(first_wgt_dropna) == 4381

    other_all = others.totalwgt_lb
    other_valid = other_all.dropna()
    print('Others', len(other_all), len(other_valid))
    #assert len(other_wgt_dropna) == 4706

    pmf_first = thinkstats2.Pmf(first_valid, label='first')
    pmf_other = thinkstats2.Pmf(other_valid, label='other')

    # PMFs of birth weight, bars aligned to opposite sides of each value
    bar_width = 0.4 / 16
    thinkplot.PrePlot(2)
    for pmf, side in ((pmf_first, 'right'), (pmf_other, 'left')):
        thinkplot.Hist(pmf, align=side, width=bar_width)
    thinkplot.Save(root='cumulative_birthwgt_pmf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='PMF')

    # CDFs of birth weight, built from the columns as stored
    cdf_first = thinkstats2.Cdf(firsts.totalwgt_lb, label='first')
    cdf_other = thinkstats2.Cdf(others.totalwgt_lb, label='other')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf_first, cdf_other])
    thinkplot.Save(root='cumulative_birthwgt_cdf',
                   title='Birth weight',
                   xlabel='weight (pounds)',
                   ylabel='CDF',
                   axis=[0, 12.5, 0, 1])
def MakePlot(self, root='redline3'):
    """Save the CDFs of prior x, posterior x and y, scaled by 1/60.

    root: string filename root for the saved figure
    """
    # observed gaps
    unscaled = [
        self.prior_x.MakeCdf(),
        self.post_x.MakeCdf(),
        self.pmf_y.MakeCdf(),
    ]
    scaled = ScaleDists(unscaled, 1.0 / 60)

    thinkplot.Clf()
    thinkplot.PrePlot(3)
    thinkplot.Cdfs(scaled)
    thinkplot.Save(root=root,
                   xlabel='Time (min)',
                   ylabel='CDF',
                   formats=FORMATS)
def PlotConditionalCdfs(self):
    """Save the CDF of age for each of four buckets as 'kidney6'."""
    # bucket -> legend name; per the original note the buckets correspond
    # to 2.01, 4.95 cm, 9.97 cm, 14.879 cm
    labeled_buckets = [
        (7.0, '2 cm'),
        (16.0, '5 cm'),
        (23.0, '10 cm'),
        (27.0, '15 cm'),
    ]

    cdfs = [self.cache.ConditionalCdf(bucket, name)
            for bucket, name in labeled_buckets]

    thinkplot.Clf()
    thinkplot.PrePlot(num=len(cdfs))
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(root='kidney6',
                   title='Distribution of age for several diameters',
                   formats=FORMATS,
                   xlabel='tumor age (years)',
                   ylabel='CDF',
                   loc=4)
def PlotPosteriors():
    """Compare the posteriors built from two sets of fake data.

    Builds the posteriors named 'headline a' and 'headline b', prints
    `lt + eq / 2` and `gt + eq / 2` (where lt, eq and gt are the results of
    comparing the two posteriors with <, == and >, so half of the tie value
    is credited to each side), and shows both posterior CDFs.
    """
    thinkbayes.RandomSeed(18)

    data1 = FakeData(100, 0.03)
    data2 = FakeData(100, 0.05)

    pmf1 = MakePosterior(data1, name="headline a")
    pmf2 = MakePosterior(data2, name="headline b")

    lt = pmf1 < pmf2
    eq = pmf1 == pmf2
    gt = pmf1 > pmf2

    # Single-argument print() calls emit the same text on Python 2 and 3;
    # the original print statements are a syntax error on Python 3.
    print(lt + eq / 2)
    print(gt + eq / 2)

    cdf1 = pmf1.MakeCdf()
    cdf2 = pmf2.MakeCdf()

    thinkplot.PrePlot(num=2)
    thinkplot.Cdfs([cdf1, cdf2])
    thinkplot.Show(axis=[0, 0.2, 0, 1])
# Draw the wins and runs PMFs as histograms with opposite bar alignment
# (presumably so the two sets of bars sit next to each other).
width=0.45
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(wins_pmf, align='right', width=width)
thinkplot.Hist(runs_pmf, align='left', width=width)
thinkplot.Config(xlabel='Result', ylabel='PMF')


# In[30]:


# Build the CDFs of both the wins and the runs and plot them together.
wins_cdf = thinkstats2.Cdf(wins, label='Wins')
runs_cdf = thinkstats2.Cdf(runs, label='Runs')
thinkplot.PrePlot(2)
thinkplot.Cdfs([wins_cdf, runs_cdf])
thinkplot.Config(xlabel='Result', ylabel='CDF')


# In[32]:


# Bare expression: the result is not assigned to a name here.
# Original author's note: these numpy arrays are used later.
(data.W.values)


# In[34]:


# Bare expression, same as above, for the R column.
(data.R.values)


# In[46]:
# anim = viewer.animate(frames=100)
#
# plt.show()

# RandomSeed(17) presumably seeds the random number generators so the run
# below is repeatable -- confirm against its definition.
RandomSeed(17)

env = Sugarscape(50, num_agents=250,
                 min_lifespan=60, max_lifespan=100,
                 replace=True)

# Record the distribution of agent sugar after each batch of 100 steps.
cdfs = []
for i in range(5):
    # Plain loop: the original built a throwaway list of step() results
    # with a comprehension that also reused the name `i`.
    for _ in range(100):
        env.step()
    cdf = Cdf(agent.sugar for agent in env.agents)
    cdfs.append(cdf)

# Left panel: earlier snapshots in gray, the last snapshot on top.
thinkplot.preplot(cols=2)
thinkplot.Cdfs(cdfs[:-1], color='gray', alpha=0.3)
thinkplot.Cdf(cdfs[-1])
thinkplot.config(xlabel='Wealth', ylabel='CDF')
thinkplot.bigger_text()

# Right panel: the same curves with a log-scaled x axis.
thinkplot.subplot(2)
thinkplot.Cdfs(cdfs[:-1], color='gray', alpha=0.3)
thinkplot.Cdf(cdfs[-1])
thinkplot.config(xlabel='Wealth', ylabel='CDF', xscale='log')
thinkplot.bigger_text()

thinkplot.save('chap09-4')
thinkplot.show(xlabel='weeks', ylabel='CDF') #%% print("10% {0} weeks".format(cdf.Value(0.1))) print("90% {0} weeks".format(cdf.Value(0.9))) #%% [markdown] # ## 4.5 CDFを比較する #%% first_cdf = thinkstats2.Cdf(firsts.totalwgt_lb, label='first') other_cdf = thinkstats2.Cdf(others.totalwgt_lb, label='other') thinkplot.PrePlot(2) thinkplot.Cdfs([first_cdf, other_cdf]) thinkplot.Show(xlabel='weight (pounds)', ylabel='CDF') #%% [markdown] # ## 4.6 パーセンタイル派生統計量 # - 中央値(median):50位パーセンタイル値 # - 四分位範囲(interquartile range, IQR):75位 - 25位パーセンタイル値 # - 分位数(quantiles):CDFにおいて等間隔で表現される統計量 #%% [markdown] # ## 4.7 乱数 #%% import numpy as np weights = live.totalwgt_lb
# Plot the PMFs of the clicked and non-clicked groups over a fixed axis
# range ([xmin, xmax, ymin, ymax]).
axis = [10000, 70000, 0, 0.01]
thinkplot.PrePlot(2)
#thinkplot.SubPlot(2)
thinkplot.Pmfs([clicked_pmf, nonclicked_pmf])
thinkplot.Config(xlabel='Area Income', axis=axis)
thinkplot.show()


############################################################################
#############################Section 3 -CDF#################################
############################################################################

# CDFs of Daily_Time_Spent for the two age groups.
age_grp_30_to_39_cdf = thinkstats2.Cdf(age_grp_30_to_39_ds.Daily_Time_Spent, label='30-39')
age_grp_18_to_29_cdf = thinkstats2.Cdf(age_grp_18_to_29_ds.Daily_Time_Spent, label='18-29')

thinkplot.PrePlot(2)
thinkplot.Cdfs([age_grp_30_to_39_cdf, age_grp_18_to_29_cdf])
thinkplot.Config(xlabel='Daily Time Spent in minutes', ylabel='CDF')
thinkplot.show()

# CDFs of Daily_Time_Spent for the male and female datasets.
male_cdf = thinkstats2.Cdf(male_ds.Daily_Time_Spent, label='male')
female_cdf = thinkstats2.Cdf(female_ds.Daily_Time_Spent, label='female')

thinkplot.PrePlot(2)
thinkplot.Cdfs([male_cdf, female_cdf])
thinkplot.Config(xlabel='Daily Time Spent in minutes', ylabel='CDF')
thinkplot.show()


##################################################################################################
############################# Section 4 -Analytical Distribution #################################
# NOTE(review): `code` here is presumably a project-local module that
# supplies thinkstats2, y, y1 and X3; the name also matches the standard
# library's `code` module -- confirm which one is imported.
from code import *
import thinkplot

# PMF of the flattened y1 values, drawn as a histogram.
y2 = y1.flatten()
pmf_scores = thinkstats2.Pmf(y2)
thinkplot.Hist(pmf_scores)
thinkplot.Config(xlabel='Runs Scored', ylabel='probability', axis=[0, 20, 0, 0.3])

# CDFs of the scores and of three batted-ball percentage columns of X3,
# shown on a single set of axes.
cdf_scores = thinkstats2.Cdf(y2, label='Runs Scored')
cdf_ld = thinkstats2.Cdf(X3['bat_LD%'], label='Line Drives')
cdf_pop = thinkstats2.Cdf(X3['bat_POP%'], label='Pop Ups')
cdf_gb = thinkstats2.Cdf(X3['bat_GB%'], label='Ground Balls')
thinkplot.PrePlot(4)
thinkplot.Cdfs([cdf_scores, cdf_ld, cdf_pop, cdf_gb])
thinkplot.Show(xlabel='balls in play (%)', ylabel='CDF')

# Visualizing data in One Dimension (1-D)
import matplotlib.pyplot as plt
y.hist(bins=15, color='steelblue', edgecolor='black', linewidth=1.0, xlabelsize=8, ylabelsize=8, grid=False)
# NOTE(review): matplotlib documents `rect` as (left, bottom, right, top) in
# normalized figure coordinates; (0, 15, 0, 15) does not look like a valid
# rectangle -- confirm what was intended.
plt.tight_layout(rect=(0, 15, 0, 15))

# visualizing one of the continuous, numeric attributes
# Histogram
fig = plt.figure(figsize = (10,4))
title = fig.suptitle("Runs", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.1)
ax = fig.add_subplot(1,1, 1)
def WeightDiffInFirstOther(first, other):
    """Show the CDFs of `totalwgt_lb` for the two groups on one plot.

    first, other: objects exposing a `totalwgt_lb` attribute
    """
    weight_cdfs = [
        thinkstats2.Cdf(first.totalwgt_lb, label='First'),
        thinkstats2.Cdf(other.totalwgt_lb, label='Other'),
    ]

    thinkplot.PrePlot(len(weight_cdfs))
    thinkplot.Cdfs(weight_cdfs)
    thinkplot.Show(xlabel='Weight (pounds)', ylabel='CDF')
thinkplot.Config(xlabel='trip duration (minutes)', ylabel='CDF', loc='upper left') # In[33]: chilly_tripduration_cdf = thinkstats2.Cdf(chilly_df.tripduration, label='chilly trip duration') thinkplot.Cdf(chilly_tripduration_cdf) thinkplot.Config(xlabel='trip duration (minutes)', ylabel='CDF', loc='upper left') # In[34]: #comparison thinkplot.Cdfs([chilly_tripduration_cdf,warm_tripduration_cdf]) thinkplot.Show(xlabel='trip duration (minutes)',ylabel='CDF') # By comparing colder and warmer temperatures with their duration, we can see that chilly bike rides are slightly shorter than warmer bike rides # Analytical Distribution # In[35]: #NORMAL CDF to for visual thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] sigmas = [0.5, 0.4, 0.3]
# Split the rides at 10800 and compare the PMFs of the two groups.
# NOTE(review): 10800 looks like three hours expressed in seconds, while the
# axis labels below say minutes -- confirm the units of moving_time.
over_three_hr = moving_time[moving_time > 10800]
less_three_hr = moving_time[moving_time <= 10800]
pmf_more = thinkstats2.Pmf(over_three_hr, label="More Than Three HR")
pmf_less = thinkstats2.Pmf(less_three_hr, label='Less Than Three HR')
pmf_stuff(1, 8000, 22000, 0, pmf_more, pmf_less, 'Ride Length (Min)', 0.02)

########### PART SIX ############
# CDF of all moving times.
cdf = thinkstats2.Cdf(moving_time, label='Moving Time')
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel='Moving Time in Min', ylabel='CDF')

# CDFs of the over/under one-hour groups; over_one_hr and less_one_hr are
# not defined in this section and must come from earlier in the script.
more_cdf = thinkstats2.Cdf(over_one_hr, label='Over Than One Hr')
less_cdf = thinkstats2.Cdf(less_one_hr, label='Less Than One Hr')
thinkplot.PrePlot(2)
thinkplot.Cdfs([more_cdf, less_cdf])
thinkplot.Show(xlabel='Moving Time (Min)', ylabel='CDF')

########### PART SEVEN ############
# Drop missing values before modelling the average watts.
avg_watts = average_watts.dropna()

def MakeNormalModel(data, label):
    cdf = thinkstats2.Cdf(data, label=label)

    mean, var = thinkstats2.TrimmedMeanVar(data)
    std = np.sqrt(var)
    print('n, mean, std', len(data), mean, std)

    xmin = mean - 4 * std
    xmax = mean + 4 * std