def MakeNormalPlot(weights): """Generates a normal probability plot of birth weights. weights: sequence """ mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01) std = math.sqrt(var) xs = [-5, 5] xs, ys = thinkstats2.FitLine(xs, mean, std) thinkplot.Plot(xs, ys, color='0.8', label='model') xs, ys = thinkstats2.NormalProbability(weights) thinkplot.Plot(xs, ys, label='weights')
def main(filename='mystery0.dat'): data = ReadFile(filename) cdf = thinkstats2.Cdf(data) thinkplot.PrePlot(rows=2, cols=3) thinkplot.SubPlot(1) thinkplot.Cdf(cdf) thinkplot.Config(title='linear') thinkplot.SubPlot(2) scale = thinkplot.Cdf(cdf, xscale='log') thinkplot.Config(title='logx', **scale) thinkplot.SubPlot(3) scale = thinkplot.Cdf(cdf, transform='exponential') thinkplot.Config(title='expo', **scale) thinkplot.SubPlot(4) xs, ys = thinkstats2.NormalProbability(data) thinkplot.Plot(xs, ys) thinkplot.Config(title='normal') thinkplot.SubPlot(5) scale = thinkplot.Cdf(cdf, transform='pareto') thinkplot.Config(title='pareto', **scale) thinkplot.SubPlot(6) scale = thinkplot.Cdf(cdf, transform='weibull') thinkplot.Config(title='weibull', **scale) thinkplot.Show(legend=False)
def ResampleSurvival(resp, iters=101): """Resamples respondents and estimates the survival function. resp: DataFrame of respondents iters: number of resamples """ _, sf = EstimateMarriageSurvival(resp) thinkplot.Plot(sf) low, high = resp.agemarry.min(), resp.agemarry.max() ts = np.arange(low, high, 1 / 12.0) ss_seq = [] for _ in range(iters): sample = thinkstats2.ResampleRowsWeighted(resp) _, sf = EstimateMarriageSurvival(sample) ss_seq.append(sf.Probs(ts)) low, high = thinkstats2.PercentileRows(ss_seq, [5, 95]) thinkplot.FillBetween(ts, low, high, color='gray', label='90% CI') thinkplot.Save(root='survival3', xlabel='age (years)', ylabel='prob unmarried', xlim=[12, 46], ylim=[0, 1], formats=FORMATS)
def EstimateMarriageSurvivalByDecade(groups, **options): """Groups respondents by decade and plots survival curves. groups: GroupBy object """ thinkplot.PrePlot(len(groups)) for _, group in groups: _, sf = EstimateMarriageSurvival(group) thinkplot.Plot(sf, **options)
def AddLabelsByDecade(groups, **options): """Draws fake points in order to add labels to the legend. groups: GroupBy object """ thinkplot.PrePlot(len(groups)) for name, _ in groups: label = '%d0s' % name thinkplot.Plot([15], [1], label=label, **options)
def MakeFigures(df): """Plots the CDF of income in several forms. """ xs, ps = df.income.values, df.ps.values cdf = SmoothCdf(xs, ps, label='data') cdf_log = SmoothCdf(np.log10(xs), ps, label='data') # linear plot thinkplot.Cdf(cdf) thinkplot.show(root='hinc_linear', xlabel='household income', ylabel='CDF') # pareto plot # for the model I chose parameters by hand to fit the tail xs, ys = thinkstats2.RenderParetoCdf(xmin=55000, alpha=2.5, low=0, high=250000) thinkplot.Plot(xs, 1 - ys, label='model', color='0.8') thinkplot.Cdf(cdf, complement=True) thinkplot.show(root='hinc_pareto', xlabel='log10 household income', ylabel='CCDF', xscale='log', yscale='log') # lognormal plot # for the model I estimate mu and sigma using # percentile-based statistics median = cdf_log.Percentile(50) iqr = cdf_log.Percentile(75) - cdf_log.Percentile(25) std = iqr / 1.349 # choose std to match the upper tail std = 0.35 print(median, std) xs, ps = thinkstats2.RenderNormalCdf(median, std, low=3.5, high=5.5) thinkplot.Plot(xs, ps, label='model', color='0.8') thinkplot.Cdf(cdf_log) thinkplot.show(root='hinc_normal', xlabel='log10 household income', ylabel='CDF')
def MakeNormalPlot(weights, term_weights): """Generates a normal probability plot of birth weights.""" mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01) std = math.sqrt(var) xs = [-4, 4] fxs, fys = thinkstats2.FitLine(xs, mean, std) thinkplot.Plot(fxs, fys, linewidth=4, color='0.8') thinkplot.PrePlot(2) xs, ys = thinkstats2.NormalProbability(weights) thinkplot.Plot(xs, ys, label='all live') xs, ys = thinkstats2.NormalProbability(term_weights) thinkplot.Plot(xs, ys, label='full term') thinkplot.Save(root='analytic_birthwgt_normal', title='Normal probability plot', xlabel='Standard deviations from mean', ylabel='Birth weight (lbs)')
def PlotMarriageData(resp): """Plots hazard and survival functions. resp: DataFrame of respondents """ hf, sf = EstimateMarriageSurvival(resp) thinkplot.PrePlot(rows=2) thinkplot.Plot(hf) thinkplot.Config(ylabel='hazard', legend=False) thinkplot.SubPlot(2) thinkplot.Plot(sf) thinkplot.Save(root='survival2', xlabel='age (years)', ylabel='prob unmarried', ylim=[0, 1], legend=False, formats=FORMATS) return sf
def MakeParetoCdf2(): """Generates a plot of the CDF of height in Pareto World.""" xmin = 100 alpha = 1.7 xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 1000.0, n=100) thinkplot.Plot(xs, ps) thinkplot.Save(root='analytic_pareto_height', title='Pareto CDF', xlabel='height (cm)', ylabel='CDF', legend=False)
def PlotSurvival(complete): """Plots survival and hazard curves. complete: list of complete lifetimes """ thinkplot.PrePlot(3, rows=2) cdf = thinkstats2.Cdf(complete, label='cdf') sf = MakeSurvivalFromCdf(cdf, label='survival') print(cdf[13]) print(sf[13]) thinkplot.Plot(sf) thinkplot.Cdf(cdf, alpha=0.2) thinkplot.Config() thinkplot.SubPlot(2) hf = sf.MakeHazardFunction(label='hazard') print(hf[39]) thinkplot.Plot(hf) thinkplot.Config(ylim=[0, 0.75])
def PlotHazard(complete, ongoing): """Plots the hazard function and survival function. complete: list of complete lifetimes ongoing: list of ongoing lifetimes """ # plot S(t) based on only complete pregnancies sf = MakeSurvivalFromSeq(complete) thinkplot.Plot(sf, label='old S(t)', alpha=0.1) thinkplot.PrePlot(2) # plot the hazard function hf = EstimateHazardFunction(complete, ongoing) thinkplot.Plot(hf, label='lams(t)', alpha=0.5) # plot the survival function sf = hf.MakeSurvival() thinkplot.Plot(sf, label='S(t)') thinkplot.Show(xlabel='t (weeks)')
def MakeParetoCdf(): """Generates a plot of the Pareto CDF.""" xmin = 0.5 thinkplot.PrePlot(3) for alpha in [2.0, 1.0, 0.5]: xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha, 0, 10.0, n=100) thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha) thinkplot.Save(root='analytic_pareto_cdf', title='Pareto CDF', xlabel='x', ylabel='CDF')
def MakeExpoCdf(): """Generates a plot of the exponential CDF.""" thinkplot.PrePlot(3) for lam in [2.0, 1, 0.5]: xs, ps = thinkstats2.RenderExpoCdf(lam, 0, 3.0, 50) label = r'$\lambda=%g$' % lam thinkplot.Plot(xs, ps, label=label) thinkplot.Save(root='analytic_expo_cdf', title='Exponential CDF', xlabel='x', ylabel='CDF')
def PlotRemainingLifetime(sf1, sf2): """Plots remaining lifetimes for pregnancy and age at first marriage. sf1: SurvivalFunction for pregnancy length sf2: SurvivalFunction for age at first marriage """ thinkplot.PrePlot(cols=2) rem_life1 = sf1.RemainingLifetime() thinkplot.Plot(rem_life1) thinkplot.Config(title='remaining pregnancy length', xlabel='weeks', ylabel='mean remaining weeks') thinkplot.SubPlot(2) func = lambda pmf: pmf.Percentile(50) rem_life2 = sf2.RemainingLifetime(filler=np.inf, func=func) thinkplot.Plot(rem_life2) thinkplot.Config(title='years until first marriage', ylim=[0, 15], xlim=[11, 31], xlabel='age (years)', ylabel='median remaining years') thinkplot.Save(root='survival6', formats=FORMATS)
def PlotPredictionsByDecade(groups, **options): """Groups respondents by decade and plots survival curves. groups: GroupBy object """ hfs = [] for _, group in groups: hf, sf = EstimateMarriageSurvival(group) hfs.append(hf) thinkplot.PrePlot(len(hfs)) for i, hf in enumerate(hfs): if i > 0: hf.Extend(hfs[i - 1]) sf = hf.MakeSurvival() thinkplot.Plot(sf, **options)
def PlotConditionalSurvival(durations): """Plots conditional survival curves for a range of t0. durations: list of durations """ pmf = thinkstats2.Pmf(durations) times = [8, 16, 24, 32] thinkplot.PrePlot(len(times)) for t0 in times: sf = ConditionalSurvival(pmf, t0) label = 't0=%d' % t0 thinkplot.Plot(sf, label=label) thinkplot.Show()
def MakeNormalModel(weights): """Plots a CDF with a Normal model. weights: sequence """ cdf = thinkstats2.Cdf(weights, label='weights') mean, var = thinkstats2.TrimmedMeanVar(weights) std = math.sqrt(var) print('n, mean, std', len(weights), mean, std) xmin = mean - 4 * std xmax = mean + 4 * std xs, ps = thinkstats2.RenderNormalCdf(mean, std, xmin, xmax) thinkplot.Plot(xs, ps, label='model', linewidth=4, color='0.8') thinkplot.Cdf(cdf)
def MakeExampleNormalPlot(): """Generates a sample normal probability plot. """ n = 1000 thinkplot.PrePlot(3) mus = [0, 1, 5] sigmas = [1, 1, 2] for mu, sigma in zip(mus, sigmas): sample = np.random.normal(mu, sigma, n) xs, ys = thinkstats2.NormalProbability(sample) label = '$\mu=%d$, $\sigma=%d$' % (mu, sigma) thinkplot.Plot(xs, ys, label=label) thinkplot.Save(root='analytic_normal_prob_example', title='Normal probability plot', xlabel='standard normal sample', ylabel='sample values')
def MakeNormalCdf(): """Generates a plot of the normal CDF.""" thinkplot.PrePlot(3) mus = [1.0, 2.0, 3.0] sigmas = [0.5, 0.4, 0.3] for mu, sigma in zip(mus, sigmas): xs, ps = thinkstats2.RenderNormalCdf(mu=mu, sigma=sigma, low=-1.0, high=4.0) label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma) thinkplot.Plot(xs, ps, label=label) thinkplot.Save(root='analytic_normal_cdf', title='Normal CDF', xlabel='x', ylabel='CDF', loc=2)
def MakeNormalModel(weights): """Plot the CDF of birthweights with a normal model.""" # estimate parameters: trimming outliers yields a better fit mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01) print('Mean, Var', mu, var) # plot the model sigma = math.sqrt(var) print('Sigma', sigma) xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=12.5) thinkplot.Plot(xs, ps, label='model', color='0.8') # plot the data cdf = thinkstats2.Cdf(weights, label='data') thinkplot.PrePlot(1) thinkplot.Cdf(cdf) thinkplot.Save(root='analytic_birthwgt_model', title='Birth weights', xlabel='birth weight (lbs)', ylabel='CDF')