def window_plot():
    """Plot a sinusoid, a Hamming window, and the windowed sinusoid.

    The three curves are drawn in three stacked panels that share the same
    axis limits, and the figure is saved under the root name 'windowing2'.
    """
    signal = thinkdsp.SinSignal(freq=440)
    # 10.25 periods: a non-integer number of cycles
    duration = signal.period * 10.25

    plain = signal.make_wave(duration)
    windowed = signal.make_wave(duration)

    # wrap the window samples in a Wave so they can be plotted like the others
    window = thinkdsp.Wave(numpy.hamming(len(plain.ys)), plain.framerate)
    windowed.hamming()

    thinkplot.preplot(rows=3, cols=1)
    pyplot.subplots_adjust(
        wspace=0.3, hspace=0.3,
        right=0.95, left=0.1,
        top=0.95, bottom=0.05,
    )

    # only the bottom panel carries an x-axis label
    panels = [
        (plain, {}),
        (window, {}),
        (windowed, dict(xlabel='time (s)')),
    ]
    for position, (wave, extra) in enumerate(panels, start=1):
        thinkplot.subplot(position)
        wave.plot()
        thinkplot.Config(axis=[0, duration, -1.07, 1.07], **extra)

    thinkplot.save(root='windowing2')
def main():
    """Read 'mystery0.dat' and show four views of its distribution.

    The 2x2 grid contains the PMF of the raw values, the PMF of the binned
    values, a PMF derived from a kernel density estimate, and the CDF.
    """
    data = read_file('mystery0.dat')

    raw_pmf = thinkstats2.MakePmfFromList(data)
    cdf = thinkstats2.MakeCdfFromList(data)
    estimated_pdf = thinkstats2.EstimatedPdf(data)

    low, high = min(data), max(data)
    # evaluate the density estimate at 101 evenly spaced points
    kde_pmf = estimated_pdf.MakePmf(numpy.linspace(low, high, 101))

    binned_pmf = thinkstats2.MakePmfFromList(BinData(data, low, high, 51))

    # (plotting function, object to plot, extra options, panel title)
    panels = [
        (thinkplot.Hist, raw_pmf, dict(width=0.1), 'Naive Pmf'),
        (thinkplot.Hist, binned_pmf, {}, 'Binned Hist'),
        (thinkplot.Pmf, kde_pmf, {}, 'KDE PDF'),
        (thinkplot.Cdf, cdf, {}, 'CDF'),
    ]
    for position, (draw, obj, options, title) in enumerate(panels, start=1):
        thinkplot.SubPlot(2, 2, position)
        draw(obj, **options)
        thinkplot.Config(title=title)

    thinkplot.Show()
def MakeBabyBoom():
    """Plot the CDF of interarrival times and compare it with a model.

    Two figures are saved:

    'analytic_interarrivals': the CDF of the observed interarrival times
    on a linear scale, next to its complement (CCDF) on a log-y scale.

    'analytic_interarrivals_model': the CCDF of the observed data together
    with the CCDF of an exponential sample of the same size.
    """
    # compute the interarrival times
    df = ReadBabyBoom()
    diffs = df.minutes.diff()
    cdf = thinkstats2.Cdf(diffs, label='actual')

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals')

    n = len(diffs)
    # Rate in events per minute (presumably 44 births over 24 hours).
    # The interarrival times are in minutes, so the day has to be divided
    # into 24 * 60 minutes; the previous `44 / 24 * 60.0` multiplied by 60
    # and produced a rate thousands of times too large.
    lam = 44.0 / (24 * 60)
    sample = [random.expovariate(lam) for _ in range(n)]

    model = thinkstats2.Cdf(sample, label='model')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, model], complement=True)
    thinkplot.Save(root='analytic_interarrivals_model',
                   title='Time between births',
                   xlabel='minutes',
                   ylabel='CCDF',
                   yscale='log')
def PlotRemainingLifetime(sf1, sf2):
    """Plot remaining-lifetime curves for two survival functions side by side.

    The left panel uses the default summary of sf1.RemainingLifetime(); the
    right panel summarises sf2 with the 50th percentile and fills with
    infinity. The figure is saved under the root name 'survival6'.

    sf1: SurvivalFunction for pregnancy length
    sf2: SurvivalFunction for age at first marriage
    """
    def median(pmf):
        # 50th percentile of the remaining-lifetime distribution
        return pmf.Percentile(50)

    thinkplot.PrePlot(cols=2)

    # left panel
    pregnancy_remaining = sf1.RemainingLifetime()
    thinkplot.Plot(pregnancy_remaining)
    left_options = dict(
        title='remaining pregnancy length',
        xlabel='weeks',
        ylabel='mean remaining weeks',
    )
    thinkplot.Config(**left_options)

    # right panel
    thinkplot.SubPlot(2)
    marriage_remaining = sf2.RemainingLifetime(filler=np.inf, func=median)
    thinkplot.Plot(marriage_remaining)
    right_options = dict(
        title='years until first marriage',
        ylim=[0, 15],
        xlim=[11, 31],
        xlabel='age (years)',
        ylabel='median remaining years',
    )
    thinkplot.Config(**right_options)

    thinkplot.Save(root='survival6', formats=FORMATS)
def MakeFigures(df):
    """Compare adult weights with a normal model on linear and log10 scales.

    Saves two two-panel figures: 'brfss_weight' (drawn by MakeNormalModel)
    and 'brfss_weight_normal' (drawn by MakeNormalPlot). In each figure the
    left panel uses the weights and the right panel their base-10 logarithm.

    df: DataFrame with a wtkg2 column; missing values are dropped
    """
    kilograms = df.wtkg2.dropna()
    log_kilograms = np.log10(kilograms)

    # figure 1: model comparison on both scales
    thinkplot.PrePlot(cols=2)
    MakeNormalModel(kilograms)
    thinkplot.Config(xlabel='adult weight (kg)', ylabel='CDF')

    thinkplot.SubPlot(2)
    MakeNormalModel(log_kilograms)
    thinkplot.Config(xlabel='adult weight (log10 kg)')

    thinkplot.Save(root='brfss_weight')

    # figure 2: normal probability plots on both scales
    thinkplot.PrePlot(cols=2)
    MakeNormalPlot(kilograms)
    thinkplot.Config(xlabel='z', ylabel='weights (kg)')

    thinkplot.SubPlot(2)
    MakeNormalPlot(log_kilograms)
    thinkplot.Config(xlabel='z', ylabel='weights (log10 kg)')

    thinkplot.Save(root='brfss_weight_normal')
def MakePdfs(greq, less):
    """Show estimated birth-weight PDFs for two groups in adjacent panels.

    greq: DataFrame for the 'greater/equal to 30' group
    less: DataFrame for the 'less than 30' group

    Both frames need a totalwgt_lb column; missing values are dropped.
    """
    # estimate both densities before any drawing starts
    estimates = [
        (thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()),
         'greater/equal to 30'),
        (thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()),
         'less than 30'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (pdf, label) in enumerate(estimates, start=1):
        thinkplot.SubPlot(position)
        thinkplot.Pdf(pdf, label=label)
        thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')

    thinkplot.Show()
def pmf_stuff(width, x_low, x_high, third, pmf_one, pmf_two, label, y_axis_scale):
    """Show two PMFs as adjacent bar charts and as overlaid PMF lines.

    width: bar width used for both histograms
    x_low, x_high: first two entries of the axis list (x limits)
    third: third entry of the axis list (lower y limit)
    pmf_one, pmf_two: the two Pmf objects to compare
    label: x-axis label for both panels
    y_axis_scale: fourth entry of the axis list (upper y limit)
    """
    # both panels share labels and axis limits
    shared = dict(
        xlabel=label,
        ylabel='PMF',
        axis=[x_low, x_high, third, y_axis_scale],
    )

    # left panel: bars aligned to opposite sides so they sit next to each other
    thinkplot.PrePlot(2, cols=2)
    for pmf, side in ((pmf_one, 'right'), (pmf_two, 'left')):
        thinkplot.Hist(pmf, align=side, width=width)
    thinkplot.Config(**shared)

    # right panel: the same PMFs drawn as lines
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_one, pmf_two])
    thinkplot.Config(**shared)

    thinkplot.Show()
def MakeFigures():
    """Plot the distribution of populations against two candidate models.

    Figure 'populations_pareto' shows the CCDF of log10 population on a
    log-y scale next to a Pareto model. The tail looks like a straight
    line there, which suggests a Pareto distribution, but that impression
    is misleading.

    Figure 'populations_normal' shows the CDF of log10 population next to
    a normal model, plus a normal probability plot of the log values. On
    this scale the data show the sigmoid shape of a lognormal distribution
    and the probability plot confirms a very good lognormal fit.

    Many phenomena described with Pareto models can be described as well,
    or better, with lognormal models.
    """
    populations = ReadData()
    print('Number of cities/towns', len(populations))

    log_populations = np.log10(populations)
    # built from the raw values; the plots below use the log10 CDF
    cdf = thinkstats2.Cdf(populations, label='data')
    log_cdf = thinkstats2.Cdf(log_populations, label='data')

    # --- Pareto comparison ---
    pareto_xs, pareto_ps = thinkstats2.RenderParetoCdf(
        xmin=5000, alpha=1.4, low=0, high=1e7)
    thinkplot.Plot(np.log10(pareto_xs), 1-pareto_ps, label='model', color='0.8')

    thinkplot.Cdf(log_cdf, complement=True)
    thinkplot.Config(xlabel='log10 population', ylabel='CCDF', yscale='log')
    thinkplot.Save(root='populations_pareto')

    # --- lognormal comparison ---
    thinkplot.PrePlot(cols=2)

    mu = log_populations.mean()
    sigma = log_populations.std()
    normal_xs, normal_ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(normal_xs, normal_ps, label='model', color='0.8')

    thinkplot.Cdf(log_cdf)
    thinkplot.Config(xlabel='log10 population', ylabel='CDF')

    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_populations, label='data')
    thinkplot.Config(xlabel='z', ylabel='log10 population', xlim=[-5, 5])

    thinkplot.Save(root='populations_normal')
def PlotSurvivalFunctions(sf_map, predict_flag=False):
    """Draw a shaded band, and optionally a central line, for each group.

    Groups are visited in descending order of name. For each one the
    10th/50th/90th percentile rows come from MakeSurvivalCI; the band spans
    the first and last rows and the line follows the middle row.

    sf_map: map from group name to sequence of survival functions
    predict_flag: when true, only the bands are drawn (no labelled lines)
    """
    thinkplot.PrePlot(len(sf_map))

    for name, sf_seq in sorted(sf_map.items(), reverse=True):
        # skip groups with no functions, or whose first function is empty
        if len(sf_seq) == 0 or len(sf_seq[0]) == 0:
            continue

        ts, rows = MakeSurvivalCI(sf_seq, [10, 50, 90])
        thinkplot.FillBetween(ts, rows[0], rows[2], color='gray')

        if predict_flag:
            continue
        thinkplot.Plot(ts, rows[1], label='19%d' % name)

    options = dict(
        xlabel='age (years)',
        ylabel='prob unmarried',
        xlim=[14, 45],
        ylim=[0, 1],
        legend=True,
        loc='upper right',
    )
    thinkplot.Config(**options)
def main():
    """Show the CDF of the 'ps' column on two scales, then print the data.

    The left panel uses a log x-axis and the right panel the 'pareto'
    transform. Each panel is configured with the scale options returned by
    thinkplot.Cdf.
    """
    df = ReadData()
    cdf = thinkstats2.Cdf(df['ps'])

    # (panel title, options forwarded to thinkplot.Cdf)
    panels = [
        ('logx', dict(xscale='log')),
        ('pareto', dict(transform='pareto')),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (title, cdf_options) in enumerate(panels, start=1):
        thinkplot.SubPlot(position)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    thinkplot.show(legend=False)
    print(df)
def MakeArrivalDepartureDelayScatterPlots(flights):
    """Make a jittered scatter plot of departure delay against arrival delay.

    Takes a sample of 10000 rows from flights, gets jittered delays from
    GetArrivalDepartureDelay, plots arrival delay on the x-axis and
    departure delay on the y-axis, and saves the figure under the root
    name 'ArrivalDepartureDelayScatterplot'.

    flights: table of flights, passed to thinkstats2.SampleRows
    """
    sample = thinkstats2.SampleRows(flights, 10000)

    # simple scatter plot
    # NOTE(review): the un-jittered plot below is commented out, so only one
    # of the two columns requested here is drawn -- confirm cols=2 is intended.
    thinkplot.PrePlot(cols=2)
    # departureDelays, arrivalDelays = GetArrivalDepartureDelay(sample)
    # airports = sample.AIRLINE
    # arrivalDelays = sample.ARRIVAL_DELAY
    # ScatterPlot(airports, arrivalDelays)

    # scatter plot with jitter
    # thinkplot.SubPlot(2)
    departureDelays, arrivalDelays = GetArrivalDepartureDelay(sample, hjitter=1.3, wjitter=0.5)

    # arrival delay goes on the x-axis, matching the labels below
    thinkplot.Scatter(arrivalDelays, departureDelays, alpha=1)
    thinkplot.Config(
        xlabel='arrival delay (min)',
        ylabel='departure delay (min)',
        # axis=[-20, 20, 20, 200],
        legend=False)

    thinkplot.Save(root='ArrivalDepartureDelayScatterplot')
def SimulateSample(lam=2, n=10, m=1000):
    """Simulate the sampling distribution of L, an estimator of lam.

    Each iteration draws n exponential values with mean 1/lam and estimates
    lam as the reciprocal of the sample mean. The standard error and the
    5th-95th percentile interval are printed, and the CDF of the estimates
    is plotted with the interval marked by vertical lines.

    lam: parameter of an exponential distribution
    n: sample size
    m: number of iterations

    returns: standard error (RMSE of the estimates against lam)
    """
    def mark(x, y=1):
        # light vertical line from 0 up to y at position x
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    estimates = [
        1 / np.mean(np.random.exponential(1 / lam, n))
        for _ in range(m)
    ]

    stderr = RMSE(estimates, lam)
    print('standard error', stderr)

    cdf = thinkstats2.Cdf(estimates)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)
    for bound in ci:
        mark(bound)

    # plot the CDF
    thinkplot.Cdf(cdf)
    # NOTE(review): `root` looks like a leftover from a Save call -- confirm
    # whether Config does anything with it.
    thinkplot.Config(
        root='estimation2',
        xlabel='estimate',
        ylabel='CDF',
        title='Sampling distribution',
    )

    return stderr
def Sample(lam=2, iters=1000):
    """Sampling distribution of L as an estimator of the exponential parameter.

    For each sample size n in [5, 10, 15], runs iters simulations, estimates
    lam as the reciprocal of the sample mean, and prints the standard error
    (RMSE against lam) and the 5th-95th percentile interval of the estimates.

    lam: parameter of an exponential distribution
    iters: number of simulated samples per value of n
    """
    def VertLine(x, y=1):
        # light vertical line from 0 up to y at position x
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    # repeat for multiple values of n
    for n in [5, 10, 15]:
        estimates = []
        for i in range(iters):
            xs = np.random.exponential(1 / lam, n)
            lamhat = 1 / np.mean(xs)
            estimates.append(lamhat)

        stderr = RMSE(estimates, lam)
        print("The standard error for n = {} is: {:.5f}".format(n, stderr))

        cdf = thinkstats2.Cdf(estimates)
        ci = cdf.Percentile(5), cdf.Percentile(95)
        print("The 90'%' confidence interval is: {}".format(ci))

        # NOTE(review): the nesting of the plotting calls could not be
        # recovered with certainty; they are assumed to run only for
        # n == 10 -- confirm against the intended output.
        if n == 10:
            # set the lower/upper ends of confidence interval
            VertLine(ci[0])
            VertLine(ci[1])

            # plot the CDF
            thinkplot.Cdf(cdf)
            thinkplot.Config(xlabel='estimate',
                             ylabel='CDF',
                             title='Sampling Distribution')
def ScatterPlot(ages, weights, alpha=1.0):
    """Scatter weights against ages with fixed axis limits and no legend.

    ages: x values
    weights: y values
    alpha: transparency passed through to thinkplot.Scatter
    """
    thinkplot.Scatter(ages, weights, alpha=alpha)

    options = dict(
        xlabel='age (years)',
        ylabel='weight (lbs)',
        xlim=[10, 45],
        ylim=[0, 15],
        legend=False,
    )
    thinkplot.Config(**options)
def MakeStep(greq, less):
    """Show the pregnancy-length PMFs of two groups on one set of axes.

    greq: DataFrame for the 'greater/equal to 30' group
    less: DataFrame for the 'less than 30' group

    Both frames need a prglngth column.
    """
    pmfs = [
        thinkstats2.Pmf(greq.prglngth, label='greater/equal to 30'),
        thinkstats2.Pmf(less.prglngth, label='less than 30'),
    ]
    thinkplot.Pmfs(pmfs)

    thinkplot.Config(
        xlabel='Pregnancy length(weeks)',
        axis=[0, 50, 0, 0.6],
    )
    thinkplot.Show()
def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of birth weight against age.

    age: x values
    wgt: y values
    xmin, xmax: x-axis limits
    ymin, ymax: y-axis limits
    """
    thinkplot.Scatter(age, wgt, alpha=1.0)

    options = dict(
        xlabel='Age (Years)',
        ylabel='Birth Weight (lbs)',
        xlim=[xmin, xmax],
        ylim=[ymin, ymax],
        legend=False,
    )
    thinkplot.Config(**options)

    thinkplot.Show()
def PlotDailies(dailies):
    """Make a stacked plot of daily prices, one panel per entry.

    One row is created for each entry of dailies. The first panel carries
    the title, every panel uses the same y range, and only the last panel
    keeps its x tick labels (rotated). The figure is saved under the root
    name 'timeseries1'.

    dailies: map from name to DataFrame with a ppg column
    """
    # derive the layout from the input instead of assuming three entries
    count = len(dailies)
    last = count - 1

    thinkplot.PrePlot(rows=count)
    for i, (name, daily) in enumerate(dailies.items()):
        thinkplot.SubPlot(i + 1)
        title = 'price per gram ($)' if i == 0 else ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)
        if i == last:
            # only the bottom panel shows tick labels
            pyplot.xticks(rotation=30)
        else:
            thinkplot.Config(xticks=[])

    thinkplot.Save(root='timeseries1', formats=FORMATS)
def HexBin(ages, weights, bins=None):
    """Draw a hexbin plot of weights against ages, without a legend.

    ages: sequence of float
    weights: sequence of float
    bins: 'log' or None for linear
    """
    thinkplot.HexBin(ages, weights, bins=bins)

    options = dict(
        xlabel='age (years)',
        ylabel='weight (lbs)',
        legend=False,
    )
    thinkplot.Config(**options)
def MakeCdfs(male, female):
    """Show the CDFs of baby weight for two groups on one set of axes.

    male: DataFrame with a totalwgt_lb column, labelled 'Male'
    female: DataFrame with a totalwgt_lb column, labelled 'Female'
    """
    cdfs = [
        thinkstats2.Cdf(male.totalwgt_lb, label='Male'),
        thinkstats2.Cdf(female.totalwgt_lb, label='Female'),
    ]

    thinkplot.PrePlot(2)
    thinkplot.Cdfs(cdfs)

    options = dict(
        xlabel='Baby Weight (Lbs)',
        ylabel='CDF',
        title='Baby Weights',
    )
    thinkplot.Config(**options)
    thinkplot.Show()
def MakeCdfs(male, female):
    """Show the CDFs of the alcwknd column for two groups on one set of axes.

    male: DataFrame with an alcwknd column, labelled 'Male'
    female: DataFrame with an alcwknd column, labelled 'Female'
    """
    cdfs = [
        thinkstats2.Cdf(male.alcwknd, label='Male'),
        thinkstats2.Cdf(female.alcwknd, label='Female'),
    ]

    thinkplot.PrePlot(2)
    thinkplot.Cdfs(cdfs)

    options = dict(
        xlabel='Alcohol Consumed (grams)',
        ylabel='CDF',
        title='Weekend Alcohol Consumption',
    )
    thinkplot.Config(**options)
    thinkplot.Show()
def PrintDiffMeansOneSided(
        data,
        title="CDF of sampling distribution of null hypothesis",
        label="difference in mean album score"):
    """Run a one-sided difference-in-means test, plot its CDF, print the p-value.

    data: input handed to hyp.DiffMeansOneSided
    title: plot title
    label: x-axis label
    """
    test = hyp.DiffMeansOneSided(data)
    # the p-value is computed before the CDF is plotted
    pvalue = test.PValue()

    test.PlotCdf(label='CDF')
    tp.Config(loc=2)

    show_options = dict(xlabel=label, ylabel='CDF', title=title)
    tp.Show(**show_options)

    print("Calculated p-value:", pvalue)
def MakePmfs(greq, less):
    """Show the birth-weight PMFs of two groups, offset to either side.

    greq: DataFrame for the 'greater/equal to 30' group
    less: DataFrame for the 'less than 30' group

    Both frames need a totalwgt_lb column.
    """
    step = .4 / 16

    # both PMFs are built before either is drawn
    greq_pmf = thinkstats2.Pmf(greq.totalwgt_lb, label='greater/equal to 30')
    less_pmf = thinkstats2.Pmf(less.totalwgt_lb, label='less than 30')

    for pmf, side in ((less_pmf, 'left'), (greq_pmf, 'right')):
        thinkplot.Pmf(pmf, align=side, width=step)

    thinkplot.Config(axis=[0, 15, 0, 0.04])
    thinkplot.Show()
def MakeCdfs(greq, less):
    """Show birth-weight CDFs for two groups and print their medians.

    The two CDFs are drawn on one set of axes and shown; afterwards the
    50th percentile of each group is printed.

    greq: DataFrame for the 'greater/equal to 30' group
    less: DataFrame for the 'less than 30' group

    Both frames need a totalwgt_lb column.
    """
    greqcdf = thinkstats2.Cdf(greq.totalwgt_lb, label='greater/equal to 30')
    lesscdf = thinkstats2.Cdf(less.totalwgt_lb, label='less than 30')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([greqcdf, lesscdf])
    thinkplot.Config(xlabel='Weight (lbs)', ylabel='CDF')
    thinkplot.Show()

    # Single-argument print calls produce the same text under Python 2 and
    # Python 3; the former print statements were Python 2 only.
    print('Greater/equal to 30 50th percentile: {}'.format(
        greqcdf.Percentile(50)))
    print('Less than 30 50th percentile: {}'.format(
        lesscdf.Percentile(50)))
def MakePmfs(greq, less):
    """Show pregnancy-length histograms of two groups as adjacent bars.

    greq: DataFrame for the 'greater/equal to 30' group
    less: DataFrame for the 'less than 30' group

    Both frames need a prglngth column.
    """
    bar_width = 0.45

    # both PMFs are built before either is drawn
    greq_pmf = thinkstats2.Pmf(greq.prglngth, label='greater/equal to 30')
    less_pmf = thinkstats2.Pmf(less.prglngth, label='less than 30')

    for pmf, side in ((less_pmf, 'left'), (greq_pmf, 'right')):
        thinkplot.Hist(pmf, align=side, width=bar_width)

    thinkplot.Config(axis=[0, 50, 0, 0.6])
    thinkplot.Show()
def MakeBabyBoom():
    """Plot the interarrival-time CDF on linear and log-y scales.

    The left panel shows the CDF, the right panel its complement (CCDF)
    with a logarithmic y-axis. The figure is saved under the root name
    'analytic_interarrivals'.
    """
    # interarrival times are the differences between successive minutes
    births = ReadBabyBoom()
    gaps = births.minutes.diff()
    gap_cdf = thinkstats2.Cdf(gaps, label='actual')

    thinkplot.PrePlot(cols=2)

    # left: plain CDF
    thinkplot.Cdf(gap_cdf)
    left_options = dict(xlabel='minutes', ylabel='CDF', legend=False)
    thinkplot.Config(**left_options)

    # right: complementary CDF on a log scale
    thinkplot.SubPlot(2)
    thinkplot.Cdf(gap_cdf, complement=True)
    right_options = dict(
        xlabel='minutes',
        ylabel='CCDF',
        yscale='log',
        legend=False,
    )
    thinkplot.Config(**right_options)

    thinkplot.Save(root='analytic_interarrivals', legend=False)
def MakeStep(male, female):
    """Show the PMFs of the alcwknd column for two groups on one set of axes.

    male: DataFrame with an alcwknd column, labelled 'Male'
    female: DataFrame with an alcwknd column, labelled 'Female'
    """
    pmfs = [
        thinkstats2.Pmf(male.alcwknd, label='Male'),
        thinkstats2.Pmf(female.alcwknd, label='Female'),
    ]
    thinkplot.Pmfs(pmfs)

    options = dict(
        xlabel='Alcohol Consumption (grams)',
        ylabel='PMF',
        axis=[0, 800, 0, 0.1],
        title='Weekend Alcohol Consumption',
    )
    thinkplot.Config(**options)
    thinkplot.Show()
def PlotFilled(daily, name):
    """Plot the EWMA and filled data.

    Fills the series with FillMissing (span=30), scatters the filled
    prices, overlays the EWMA line, and shows the figure.

    daily: DataFrame of daily prices
    name: string, legend label for the scattered prices
    """
    filled = FillMissing(daily, span=30)
    thinkplot.Scatter(filled.ppg, s=15, alpha=0.2, label=name)
    thinkplot.Plot(filled.ewma, label='EWMA', color='#ff7f00')
    plt.xticks(rotation=30)
    # `label` is not an option name thinkplot.Config acts on, so the text
    # was silently dropped; the plotted values are prices, so it belongs
    # on the y-axis.
    thinkplot.Config(ylabel='Price per gram ($)')
    thinkplot.Show()
def PlotSurvival(complete):
    """Plot survival and hazard curves and print three spot values.

    The first panel shows the survival function over a faint CDF; the
    second shows the hazard function. The CDF and survival values at 13
    and the hazard value at 39 are printed along the way.

    complete: list of complete lifetimes
    """
    thinkplot.PrePlot(3, rows=2)

    lifetime_cdf = thinkstats2.Cdf(complete, label='cdf')
    survival = MakeSurvivalFromCdf(lifetime_cdf, label='survival')
    for curve in (lifetime_cdf, survival):
        print(curve[13])

    # first panel: survival curve with the CDF drawn faintly for reference
    thinkplot.Plot(survival)
    thinkplot.Cdf(lifetime_cdf, alpha=0.2)
    thinkplot.Config()

    # second panel: hazard function
    thinkplot.SubPlot(2)
    hazard = survival.MakeHazardFunction(label='hazard')
    print(hazard[39])
    thinkplot.Plot(hazard)
    thinkplot.Config(ylim=[0, 0.75])
def ScatterPlot(heights, weights, alpha=1.0):
    """Scatter weights against heights on fixed axes, without a legend.

    heights: sequence of float
    weights: sequence of float
    alpha: float, transparency passed through to thinkplot.Scatter
    """
    thinkplot.Scatter(heights, weights, alpha=alpha)

    options = dict(
        xlabel='height (cm)',
        ylabel='weight (kg)',
        axis=[140, 210, 20, 200],
        legend=False,
    )
    thinkplot.Config(**options)
def HexBin(heights, weights, bins=None):
    """Draw a hexbin plot of weights against heights on fixed axes.

    heights: sequence of float
    weights: sequence of float
    bins: 'log' or None for linear
    """
    thinkplot.HexBin(heights, weights, bins=bins)

    options = dict(
        xlabel='height (cm)',
        ylabel='weight (kg)',
        axis=[140, 210, 20, 200],
        legend=False,
    )
    thinkplot.Config(**options)