def MakeFigures(df):
    """Plots adult weights on linear and log10 scales.

    Writes two two-column figures: 'brfss_weight' (drawn by MakeNormalModel)
    and 'brfss_weight_normal' (drawn by MakeNormalPlot).  In each figure the
    left panel uses the raw weights and the right panel their log10.

    df: DataFrame with a `wtkg2` column; missing values are dropped
    """
    # NOTE(review): both drawing functions are called with a single sample;
    # confirm the definitions in scope accept that signature.
    linear = df.wtkg2.dropna()
    logged = np.log10(linear)

    def two_panel_figure(draw, left_labels, right_labels, root):
        # left panel: linear scale
        thinkplot.PrePlot(cols=2)
        draw(linear)
        thinkplot.Config(**left_labels)

        # right panel: log scale
        thinkplot.SubPlot(2)
        draw(logged)
        thinkplot.Config(**right_labels)

        thinkplot.Save(root=root)

    two_panel_figure(MakeNormalModel,
                     dict(xlabel='adult weight (kg)', ylabel='CDF'),
                     dict(xlabel='adult weight (log10 kg)'),
                     'brfss_weight')

    two_panel_figure(MakeNormalPlot,
                     dict(xlabel='z', ylabel='weights (kg)'),
                     dict(xlabel='z', ylabel='weights (log10 kg)'),
                     'brfss_weight_normal')
def MakeHists(live):
    """Plots and saves one histogram per variable for live births.

    Figures are written, in order, for `birthwgt_lb`, `birthwgt_oz`,
    `agepreg` (floored with np.floor) and `prglngth`.

    live: DataFrame
    """
    def save_hist(values, label, root, xlabel, **options):
        # build the Hist, draw it, then write the figure
        thinkplot.Hist(thinkstats2.Hist(values, label=label))
        thinkplot.Save(root=root,
                       xlabel=xlabel,
                       ylabel='frequency',
                       **options)

    save_hist(live.birthwgt_lb, 'birthwgt_lb', 'first_wgt_lb_hist', 'pounds',
              axis=[-1, 14, 0, 3200])

    save_hist(live.birthwgt_oz, 'birthwgt_oz', 'first_wgt_oz_hist', 'ounces',
              axis=[-1, 16, 0, 1200])

    # no explicit axis limits for this one
    save_hist(np.floor(live.agepreg), 'agepreg', 'first_agepreg_hist',
              'years')

    save_hist(live.prglngth, 'prglngth', 'first_prglngth_hist', 'weeks',
              axis=[-1, 53, 0, 5000])
def PlotPregnancyData(preg):
    """Plots pregnancy-length survival and estimates the survival function.

    Pregnancies with outcome 1, 3 or 4 are treated as complete and those
    with outcome 6 as ongoing.  The complete lengths are passed to
    PlotSurvival and the figure is saved as 'survival1'.

    preg: DataFrame with `outcome` and `prglngth` columns

    Outcome codes from http://www.icpsr.umich.edu/nsfg6/Controller?
    displayPage=labelDetails&fileCode=PREG&section=&subSec=8016&srtLabel=611932

    1	LIVE BIRTH	 	9148
    2	INDUCED ABORTION	1862
    3	STILLBIRTH	 	120
    4	MISCARRIAGE	 	1921
    5	ECTOPIC PREGNANCY	190
    6	CURRENT PREGNANCY	352

    returns: survival function made from the hazard function estimated
             from the complete and ongoing lengths
    """
    finished = preg.query('outcome in [1, 3, 4]').prglngth
    print('Number of complete pregnancies', len(finished))

    in_progress = preg[preg.outcome == 6].prglngth
    print('Number of ongoing pregnancies', len(in_progress))

    PlotSurvival(finished)
    thinkplot.Save(root='survival1',
                   xlabel='t (weeks)',
                   formats=FORMATS)

    hazard = EstimateHazardFunction(finished, in_progress)
    return hazard.MakeSurvival()
def RunTests(data, iters=1000):
    """Runs three permutation-style tests on the data and prints results.

    The tests are DiffMeansPermute, DiffMeansOneSided and DiffStdPermute.
    The CDF of the first test is also plotted and saved as 'hypothesis1'.

    data: pair of sequences
    iters: number of iterations to run
    """
    def run(test_class, heading):
        # construct the test, compute its p-value, and report it
        test = test_class(data)
        pvalue = test.PValue(iters=iters)
        print(heading)
        PrintTest(pvalue, test)
        return test

    # difference in means, two-sided
    means_test = run(DiffMeansPermute, '\nmeans permute two-sided')

    means_test.PlotCdf()
    thinkplot.Save(root='hypothesis1',
                   title='Permutation test',
                   xlabel='difference in means (weeks)',
                   ylabel='CDF',
                   legend=False)

    # difference in means, one-sided
    run(DiffMeansOneSided, '\nmeans permute one-sided')

    # difference in standard deviations
    run(DiffStdPermute, '\nstd permute one-sided')
def ResampleSurvival(resp, iters=101):
    """Plots the estimated survival function with a resampled 90% band.

    The estimate from the full data is plotted first.  Each weighted
    resample is then re-estimated and evaluated on a monthly grid spanning
    the observed `agemarry` range; the 5th and 95th percentiles across
    resamples are shaded.  The figure is saved as 'survival3'.

    resp: DataFrame of respondents
    iters: number of resamples
    """
    _, estimate = EstimateMarriageSurvival(resp)
    thinkplot.Plot(estimate)

    youngest, oldest = resp.agemarry.min(), resp.agemarry.max()
    ts = np.arange(youngest, oldest, 1 / 12.0)

    def resampled_probs():
        # one weighted resample, evaluated on the common grid
        sample = thinkstats2.ResampleRowsWeighted(resp)
        _, resampled = EstimateMarriageSurvival(sample)
        return resampled.Probs(ts)

    ss_seq = [resampled_probs() for _ in range(iters)]

    lower, upper = thinkstats2.PercentileRows(ss_seq, [5, 95])
    thinkplot.FillBetween(ts, lower, upper, color='gray', label='90% CI')

    save_options = dict(xlabel='age (years)',
                        ylabel='prob unmarried',
                        xlim=[12, 46],
                        ylim=[0, 1],
                        formats=FORMATS)
    thinkplot.Save(root='survival3', **save_options)
def MakeParetoCdf2():
    """Plots a Pareto CDF labeled as height (cm) and saves it.

    Uses xmin=100 and alpha=1.7, rendered from 0 to 1000 with n=100;
    the figure is written as 'analytic_pareto_height'.
    """
    min_height, shape = 100, 1.7

    heights, probs = thinkstats2.RenderParetoCdf(
        min_height, shape, 0, 1000.0, n=100)
    thinkplot.Plot(heights, probs)

    save_options = dict(title='Pareto CDF',
                        xlabel='height (cm)',
                        ylabel='CDF',
                        legend=False)
    thinkplot.Save(root='analytic_pareto_height', **save_options)
def MakeExpoCdf():
    """Plots exponential CDFs for three values of lambda and saves them.

    Each curve is rendered from 0 to 3.0; the figure is written as
    'analytic_expo_cdf'.
    """
    rates = [2.0, 1, 0.5]

    thinkplot.PrePlot(3)
    for rate in rates:
        xs, probs = thinkstats2.RenderExpoCdf(rate, 0, 3.0, 50)
        thinkplot.Plot(xs, probs, label=r'$\lambda=%g$' % rate)

    thinkplot.Save(root='analytic_expo_cdf',
                   title='Exponential CDF',
                   xlabel='x',
                   ylabel='CDF')
def MakeParetoCdf():
    """Plots Pareto CDFs for three values of alpha and saves them.

    All curves share xmin=0.5 and are rendered from 0 to 10.0 with n=100;
    the figure is written as 'analytic_pareto_cdf'.
    """
    lower_bound = 0.5
    shapes = [2.0, 1.0, 0.5]

    thinkplot.PrePlot(3)
    for shape in shapes:
        xs, probs = thinkstats2.RenderParetoCdf(
            lower_bound, shape, 0, 10.0, n=100)
        label = r'$\alpha=%g$' % shape
        thinkplot.Plot(xs, probs, label=label)

    thinkplot.Save(root='analytic_pareto_cdf',
                   title='Pareto CDF',
                   xlabel='x',
                   ylabel='CDF')
def main():
    """Runs the survival analyses and writes their figures.

    Seeds the random number generator, plots the pregnancy and marriage
    data, then plots resampled survival functions by decade, first without
    and then with predictions ('survival4' and 'survival5').
    """
    thinkstats2.RandomSeed(17)

    pregnancies = nsfg.ReadFemPreg()
    preg_sf = PlotPregnancyData(pregnancies)

    # plots based on Cycle 6
    cycle6 = ReadFemResp2002()
    marriage_sf = PlotMarriageData(cycle6)

    ResampleSurvival(cycle6)

    PlotRemainingLifetime(preg_sf, marriage_sf)

    # Cycles 5 and 7
    cycle5 = ReadFemResp1995()
    cycle7 = ReadFemResp2010()

    cycles = [cycle5, cycle6, cycle7]

    def save_by_decade(root):
        # both by-decade figures share the same labels and limits
        thinkplot.Save(root=root,
                       xlabel='age (years)',
                       ylabel='prob unmarried',
                       xlim=[13, 45],
                       ylim=[0, 1],
                       formats=FORMATS)

    # resampled survival functions by decade
    PlotResampledByDecade(cycles)
    save_by_decade('survival4')

    # the same, with predictions
    PlotResampledByDecade(cycles, predict_flag=True, omit=[5])
    save_by_decade('survival5')
def MakeExampleNormalPlot():
    """Generates a sample normal probability plot.

    Draws 1000 values from each of three normal distributions
    (mu, sigma) = (0, 1), (1, 1), (5, 2), plots the normal probability
    curve of each sample, and saves the figure as
    'analytic_normal_prob_example'.
    """
    n = 1000
    thinkplot.PrePlot(3)

    mus = [0, 1, 5]
    sigmas = [1, 1, 2]

    for mu, sigma in zip(mus, sigmas):
        sample = np.random.normal(mu, sigma, n)
        xs, ys = thinkstats2.NormalProbability(sample)
        # Raw string: '\m' and '\s' are not valid escape sequences, so a
        # plain literal triggers a warning on modern Python.  The resulting
        # text is unchanged.
        label = r'$\mu=%d$, $\sigma=%d$' % (mu, sigma)
        thinkplot.Plot(xs, ys, label=label)

    thinkplot.Save(root='analytic_normal_prob_example',
                   title='Normal probability plot',
                   xlabel='standard normal sample',
                   ylabel='sample values')
def MakeComparison(firsts, others):
    """Plots side-by-side histograms of `prglngth` for two groups.

    Bars for `firsts` are right-aligned and bars for `others` left-aligned,
    each 0.45 wide; the figure is saved as 'first_nsfg_hist'.

    firsts: DataFrame
    others: DataFrame
    """
    bar_width = 0.45

    # pair each histogram with the side its bars are aligned to
    hists = [
        (thinkstats2.Hist(firsts.prglngth, label='first'), 'right'),
        (thinkstats2.Hist(others.prglngth, label='other'), 'left'),
    ]

    thinkplot.PrePlot(2)
    for hist, side in hists:
        thinkplot.Hist(hist, align=side, width=bar_width)

    thinkplot.Save(root='first_nsfg_hist',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency',
                   axis=[27, 46, 0, 2700])
def MakeNormalCdf():
    """Plots normal CDFs for three (mu, sigma) pairs and saves them.

    Each curve is rendered from -1.0 to 4.0; the figure is written as
    'analytic_normal_cdf'.
    """
    thinkplot.PrePlot(3)

    params = [(1.0, 0.5), (2.0, 0.4), (3.0, 0.3)]
    for mu, sigma in params:
        xs, probs = thinkstats2.RenderNormalCdf(
            mu=mu, sigma=sigma, low=-1.0, high=4.0)
        thinkplot.Plot(xs, probs,
                       label=r'$\mu=%g$, $\sigma=%g$' % (mu, sigma))

    save_options = dict(title='Normal CDF',
                        xlabel='x',
                        ylabel='CDF',
                        loc=2)
    thinkplot.Save(root='analytic_normal_cdf', **save_options)
def MakeBabyBoom():
    """Plots the CDF and the log-scale CCDF of interarrival times.

    Interarrival times are successive differences of the `minutes` column
    returned by ReadBabyBoom.  The two-column figure is saved as
    'analytic_interarrivals'.
    """
    # differences between consecutive rows
    interarrivals = ReadBabyBoom().minutes.diff()
    actual = thinkstats2.Cdf(interarrivals, label='actual')

    # left panel: CDF on a linear scale
    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(actual)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CDF',
                     legend=False)

    # right panel: complementary CDF with a log y axis
    thinkplot.SubPlot(2)
    thinkplot.Cdf(actual, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals',
                   legend=False)
def MakeNormalPlot(weights, term_weights):
    """Plots normal probability curves for two samples of birth weights.

    A reference line is fitted from the trimmed mean and standard deviation
    of `weights` and drawn over z in [-4, 4]; the two samples are then
    plotted on top.  The figure is saved as 'analytic_birthwgt_normal'.

    weights: sample plotted with the label 'all live'
    term_weights: sample plotted with the label 'full term'
    """
    mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    std = math.sqrt(var)

    # reference line, drawn before the color cycle is set up
    line_xs, line_ys = thinkstats2.FitLine([-4, 4], mean, std)
    thinkplot.Plot(line_xs, line_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    samples = [(weights, 'all live'), (term_weights, 'full term')]
    for sample, label in samples:
        zs, values = thinkstats2.NormalProbability(sample)
        thinkplot.Plot(zs, values, label=label)

    thinkplot.Save(root='analytic_birthwgt_normal',
                   title='Normal probability plot',
                   xlabel='Standard deviations from mean',
                   ylabel='Birth weight (lbs)')
def PlotMarriageData(resp):
    """Plots the estimated hazard and survival functions in two rows.

    The figure is saved as 'survival2'.

    resp: DataFrame of respondents

    returns: the survival function from EstimateMarriageSurvival
    """
    hazard, survival = EstimateMarriageSurvival(resp)

    # top row: hazard function
    thinkplot.PrePlot(rows=2)
    thinkplot.Plot(hazard)
    thinkplot.Config(ylabel='hazard', legend=False)

    # bottom row: survival function
    thinkplot.SubPlot(2)
    thinkplot.Plot(survival)

    save_options = dict(xlabel='age (years)',
                        ylabel='prob unmarried',
                        ylim=[0, 1],
                        legend=False,
                        formats=FORMATS)
    thinkplot.Save(root='survival2', **save_options)
    return survival
def PrintExtremes(live):
    """Saves a histogram of pregnancy lengths and prints its extremes.

    The figure is written as 'first_nsfg_hist_live'.  The ten smallest and
    ten largest (weeks, freq) pairs of the histogram are then printed.

    live: DataFrame of live births
    """
    lengths = thinkstats2.Hist(live.prglngth)
    thinkplot.Hist(lengths, label='live births')

    thinkplot.Save(root='first_nsfg_hist_live',
                   title='Histogram',
                   xlabel='weeks',
                   ylabel='frequency')

    extremes = (
        ('Shortest lengths:', lengths.Smallest),
        ('Longest lengths:', lengths.Largest),
    )
    for heading, select in extremes:
        print(heading)
        for weeks, freq in select(10):
            print(weeks, freq)
def MakeNormalModel(weights):
    """Plots the CDF of a sample of weights over a fitted normal model.

    The model uses the trimmed mean and variance of the sample (p=0.01)
    and is rendered from 0 to 12.5.  The mean, variance and sigma are
    printed, and the figure is saved as 'analytic_birthwgt_model'.

    weights: sequence of weights
    """
    # trimming outliers yields a better fit
    mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    print('Mean, Var', mu, var)

    sigma = math.sqrt(var)
    print('Sigma', sigma)

    # model curve in light gray
    model_xs, model_ps = thinkstats2.RenderNormalCdf(
        mu, sigma, low=0, high=12.5)
    thinkplot.Plot(model_xs, model_ps, label='model', color='0.8')

    # empirical CDF drawn on top of the model
    data_cdf = thinkstats2.Cdf(weights, label='data')
    thinkplot.PrePlot(1)
    thinkplot.Cdf(data_cdf)

    save_options = dict(title='Birth weights',
                        xlabel='birth weight (lbs)',
                        ylabel='CDF')
    thinkplot.Save(root='analytic_birthwgt_model', **save_options)
def PlotRemainingLifetime(sf1, sf2):
    """Plots remaining lifetimes for pregnancy and age at first marriage.

    The left panel uses the default RemainingLifetime of sf1 (labeled as
    mean remaining weeks).  The right panel uses the 50th percentile for
    sf2, with np.inf as the filler.  The figure is saved as 'survival6'.

    sf1: SurvivalFunction for pregnancy length
    sf2: SurvivalFunction for age at first marriage
    """
    def median(pmf):
        # 50th percentile of the remaining-lifetime distribution
        return pmf.Percentile(50)

    # left panel: pregnancy length
    thinkplot.PrePlot(cols=2)
    pregnancy_remaining = sf1.RemainingLifetime()
    thinkplot.Plot(pregnancy_remaining)
    thinkplot.Config(title='remaining pregnancy length',
                     xlabel='weeks',
                     ylabel='mean remaining weeks')

    # right panel: years until first marriage
    thinkplot.SubPlot(2)
    marriage_remaining = sf2.RemainingLifetime(filler=np.inf, func=median)
    thinkplot.Plot(marriage_remaining)
    thinkplot.Config(title='years until first marriage',
                     ylim=[0, 15],
                     xlim=[11, 31],
                     xlabel='age (years)',
                     ylabel='median remaining years')

    thinkplot.Save(root='survival6',
                   formats=FORMATS)