def MakePdfs(greq, less):
    """Plots KDEs of birth weight for two groups in side-by-side panels.

    greq: object with a totalwgt_lb column (presumably a pandas DataFrame);
          drawn in the first panel, labeled 'greater/equal to 30'
    less: object with a totalwgt_lb column; drawn in the second panel,
          labeled 'less than 30'
    """
    # estimate both densities up front, before any plotting starts
    panels = [
        (thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()),
         'greater/equal to 30'),
        (thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()),
         'less than 30'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (density, label) in enumerate(panels, start=1):
        thinkplot.SubPlot(position)
        thinkplot.Pdf(density, label=label)
        thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')

    thinkplot.Show()
def RunSimpleProcess(gap_times, lam=0.0333, num_passengers=15, plot=True):
    """Runs the basic analysis and optionally generates figures.

    Sets the module-level UPPER_BOUND to 1200, prints a 90% credible
    interval for the scaled gap times, then builds the two estimators.

    gap_times: sequence of float
    lam: arrival rate in passengers per second
    num_passengers: int number of passengers on the platform
    plot: boolean, whether to generate plots

    Returns: WaitTimeCalculator, ElapsedTimeEstimator
    """
    global UPPER_BOUND
    UPPER_BOUND = 1200

    # credible interval of the gap times after scaling by 1/60
    scaled_cdf = thinkstats2.Cdf(gap_times).Scale(1.0 / 60)
    print('CI z', scaled_cdf.CredibleInterval(90))

    # smooth the gap times with a KDE and discretize on the grid
    grid = MakeRange(low=10)
    gap_pmf = thinkstats2.EstimatedPdf(gap_times).MakePmf(xs=grid, label="z")

    wtc = WaitTimeCalculator(gap_pmf, inverse=False)
    if plot:
        wtc.PlotPmfs()
        wtc.MakePlot()

    ete = ElapsedTimeEstimator(wtc, lam, num_passengers)
    if plot:
        ete.MakePlot()

    return wtc, ete
def main():
    """Shows four views of the data in 'mystery0.dat' on a 2x2 grid.

    The panels are: a histogram of the raw PMF, a histogram of the binned
    data, a PMF made from the KDE, and the CDF.
    """
    sample = read_file('mystery0.dat')

    # build every representation before drawing anything
    naive_pmf = thinkstats2.MakePmfFromList(sample)
    cdf = thinkstats2.MakeCdfFromList(sample)
    kde = thinkstats2.EstimatedPdf(sample)

    low, high = min(sample), max(sample)
    kde_pmf = kde.MakePmf(numpy.linspace(low, high, 101))
    binned_pmf = thinkstats2.MakePmfFromList(BinData(sample, low, high, 51))

    # (title, drawing callback) for each panel, in grid order
    panels = [
        ('Naive Pmf', lambda: thinkplot.Hist(naive_pmf, width=0.1)),
        ('Binned Hist', lambda: thinkplot.Hist(binned_pmf)),
        ('KDE PDF', lambda: thinkplot.Pmf(kde_pmf)),
        ('CDF', lambda: thinkplot.Cdf(cdf)),
    ]
    for position, (title, draw) in enumerate(panels, start=1):
        thinkplot.SubPlot(2, 2, position)
        draw()
        thinkplot.Config(title=title)

    thinkplot.Show()
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulates the sampling distribution of the mean and saves its CDF.

    Draws m samples of size n with random.gauss(mu, sigma), prints the
    RMSE of the sample means relative to mu and their 5th and 95th
    percentiles, then plots the CDF of the means and saves it under the
    root name 'estimate1'.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: size of each simulated sample
    m: number of simulated samples
    """
    means = [
        numpy.mean([random.gauss(mu, sigma) for _i in range(n)])
        for _j in range(m)
    ]

    # print() calls instead of Python 2 print statements, so the function
    # also parses under Python 3
    print('rmse', RMSE(means, mu))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval', cdf.Percentile(5), cdf.Percentile(95))

    # estimate the PDF by KDE; the PMF is only needed by the optional
    # plot that is commented out below
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3 * stderr, mu + 3 * stderr, 101)
    pmf = pdf.MakePmf(vals)
    #thinkplot.Pmf(pmf)

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimate1',
                   xlabel='sample mean',
                   ylabel='CDF',
                   title='Sampling distribution')
def TestGte():
    """Tests the GapTimeEstimator.

    Seeds the random module, generates passenger data from a small
    hand-made set of gap times, and plots the estimator's result together
    with the CDF of wtc.pmf_zb.
    """
    random.seed(17)

    xs = [60, 120, 240]
    gap_times = [60] * 5 + [120] * 3 + [240] * 2

    # distribution of gap time (z)
    pmf_z = thinkstats2.EstimatedPdf(gap_times).MakePmf(xs=xs, label="z")
    wtc = WaitTimeCalculator(pmf_z, inverse=False)

    lam = 0.0333
    n = 100
    passenger_data = wtc.GenerateSamplePassengers(lam, n)

    # start the estimator with one zero count per value in xs
    pcounts = [0] * len(xs)
    ite = GapTimeEstimator(xs, pcounts, passenger_data)

    thinkplot.Clf()
    # thinkplot.Cdf(wtc.pmf_z.MakeCdf(label="actual z"))
    thinkplot.Cdf(wtc.pmf_zb.MakeCdf(label="actual zb"))
    ite.MakePlot()
def testEstimatedPdf(self):
    """Checks EstimatedPdf's string length, Density and MakePmf results."""
    kde = thinkstats2.EstimatedPdf([1, 2, 2, 3, 5])

    # string form and a point density
    self.assertEqual(len(str(kde)), 30)
    density_at_3 = kde.Density(3)[0]
    self.assertAlmostEqual(density_at_3, 0.19629968)

    # PMF over the default range
    default_pmf = kde.MakePmf()
    self.assertAlmostEqual(default_pmf[1.0], 0.010172282816895044)

    # PMF over an explicit range
    bounded_pmf = kde.MakePmf(low=0, high=6)
    self.assertAlmostEqual(bounded_pmf[0.0], 0.0050742294053582942)
def ComputeSkewnesses():
    """Plots KDE of birthweight and adult weight.

    For each variable, prints a heading, summarizes the data, marks the
    mean and median with vertical lines and text labels, plots the KDE,
    and saves the figure ('density_totalwgt_kde' and 'density_wtkg2_kde').
    """
    def Mark(x, height, text, shift, align):
        """Draws a vertical line at x up to height and labels it.

        The label is placed at x + shift, at one tenth of the line height.
        """
        thinkplot.Plot([x, x], [0, height], color='0.6', linewidth=1)
        thinkplot.Text(x + shift, 0.1 * height, text,
                       horizontalalignment=align)

    # birth weight
    live, _firsts, _others = first.MakeFrames()
    weights = live.totalwgt_lb.dropna()
    print('Birth weight')
    mean, median = Summarize(weights)

    height = 0.35
    Mark(mean, height, 'mean', -0.15, 'right')
    Mark(median, height, 'median', 0.1, 'left')

    thinkplot.Pdf(thinkstats2.EstimatedPdf(weights), label='birth weight')
    thinkplot.Save(root='density_totalwgt_kde',
                   xlabel='lbs',
                   ylabel='PDF')

    # adult weight
    df = brfss.ReadBrfss(nrows=None)
    weights = df.wtkg2.dropna()
    print('Adult weight')
    mean, median = Summarize(weights)

    height = 0.02499
    Mark(mean, height, 'mean', 1, 'left')
    Mark(median, height, 'median', -1.5, 'right')

    thinkplot.Pdf(thinkstats2.EstimatedPdf(weights), label='adult weight')
    thinkplot.Save(root='density_wtkg2_kde',
                   xlabel='kg',
                   ylabel='PDF',
                   xlim=[0, 200])
def RunLoop(gap_times, nums, lam=0.0333):
    """Runs the basic analysis for a range of num_passengers.

    Sets the module-level UPPER_BOUND to 4000, builds a WaitTimeCalculator
    from a resampled, biased and smoothed version of the gap times, and
    plots, for each value in nums, the probability that y exceeds 900.
    The figure is saved under the root name 'redline5'.

    gap_times: sequence of float
    nums: sequence of values for num_passengers
    lam: arrival rate in passengers per second

    Returns: None; the function has no return statement
    """
    global UPPER_BOUND
    UPPER_BOUND = 4000

    thinkplot.Clf()
    RandomSeed(18)

    # resample gap_times
    n = 220
    resampled = thinkstats2.Cdf(gap_times).Sample(n)
    pmf_resampled = thinkstats2.Pmf(resampled)

    # compute the biased pmf and add some long delays
    biased_cdf = BiasPmf(pmf_resampled).MakeCdf()
    sample_zb = numpy.append(biased_cdf.Sample(n), [1800, 2400, 3000])

    # smooth the distribution of zb
    pdf_zb = thinkstats2.EstimatedPdf(sample_zb)
    grid = MakeRange(low=60)
    pmf_zb = pdf_zb.MakePmf(xs=grid)

    # unbias the distribution of zb and make wtc
    wtc = WaitTimeCalculator(UnbiasPmf(pmf_zb))

    # posterior prob of waiting more than 15 minutes, one per value in nums
    probs = [
        1 - ElapsedTimeEstimator(wtc, lam, count).pmf_y.MakeCdf().Prob(900)
        for count in nums
    ]

    thinkplot.Plot(nums, probs)
    thinkplot.Save(root='redline5',
                   xlabel='Num passengers',
                   ylabel='P(y > 15 min)',
                   formats=FORMATS)
def GenerateSampleData(gap_times, lam=0.0333, n=10):
    """Generates passenger data based on actual gap times.

    gap_times: sequence of float
    lam: arrival rate in passengers per second
    n: number of simulated observations

    Returns: WaitTimeCalculator, and the passenger data it generated
    """
    # smooth the gap times with a KDE and discretize on the grid
    grid = MakeRange(low=10)
    gap_pmf = thinkstats2.EstimatedPdf(gap_times).MakePmf(xs=grid, label="z")

    wtc = WaitTimeCalculator(gap_pmf, inverse=False)
    return wtc, wtc.GenerateSamplePassengers(lam, n)
def main():
    """Plots the CDF of the log sample and the KDE of household income.

    Reads the data with hinc.ReadData, interpolates a log-scale sample,
    shows its CDF, then raises 10 to the power of the sample, prints the
    CDF evaluated at the mean, and shows the estimated PDF.
    """
    df = hinc.ReadData()
    log_sample = InterpolateSample(df, log_upper=6.0)

    # CDF of the log-scale sample
    thinkplot.Cdf(thinkstats2.Cdf(log_sample))
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # undo the log transform
    sample = np.power(10, log_sample)
    mean, _median = density.Summarize(sample)

    cdf = thinkstats2.Cdf(sample)
    print('cdf[mean]', cdf[mean])

    # KDE of the sample
    thinkplot.Pdf(thinkstats2.EstimatedPdf(sample))
    thinkplot.Show(xlabel='household income', ylabel='PDF')
def main():
    """Prints summary statistics of household income and plots its KDE.

    Reads the data with hinc.ReadData, interpolates a log-scale sample and
    shows its CDF, then raises 10 to the power of the sample and prints
    the median, mean, skewness, Pearson's median skewness, and the CDF
    evaluated at the mean. Finally draws the estimated PDF.
    """
    df = hinc.ReadData()
    log_sample = InterpolateSample(df, log_upper=6.0)

    log_cdf = thinkstats2.Cdf(log_sample)
    thinkplot.Cdf(log_cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # undo the log transform
    sample = np.power(10, log_sample)
    mean = np.mean(sample)
    cdf = thinkstats2.Cdf(sample)

    # print() calls instead of Python 2 print statements, so the function
    # also parses under Python 3
    print("Median:", np.median(sample))
    print("Mean:", mean)
    print("Skewness:", thinkstats2.Skewness(sample))
    print("Pearson's Skewness:", thinkstats2.PearsonMedianSkewness(sample))
    print("Percent of people with incomes <= mean:", cdf[mean])

    pdf = thinkstats2.EstimatedPdf(sample)
    # NOTE(review): the PDF is drawn but no Show/Save follows in the
    # visible code -- confirm whether the caller displays the figure
    thinkplot.Pdf(pdf)
def MakePdfExample():
    """Plots a Gaussian density and a KDE of a sample of 100 drawn from it.

    Prints the density one standard deviation above the mean and saves
    the figure under the root name 'pdf_example'.
    """
    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    std = math.sqrt(var)

    # the analytic density, evaluated at one point for reference
    gaussian = thinkstats2.GaussianPdf(mean, std)
    print(gaussian.Density(mean + std))

    # draw the analytic density
    thinkplot.PrePlot(2)
    thinkplot.Pdf(gaussian, label='Gaussian')

    # draw the KDE of a random sample
    heights = [random.gauss(mean, std) for _ in range(100)]
    thinkplot.Pdf(thinkstats2.EstimatedPdf(heights), label='sample KDE')

    thinkplot.Save(root='pdf_example',
                   xlabel='Height (cm)',
                   ylabel='Density')
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulates the sampling distribution of the mean and shows its CDF.

    Draws m samples of size n with random.gauss(mu, sigma), prints the
    RMSE of the sample means relative to mu and their 5th and 95th
    percentiles, then plots the CDF of the means and shows the figure.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: size of each simulated sample
    m: number of simulated samples
    """
    means = [
        numpy.mean([random.gauss(mu, sigma) for _i in range(n)])
        for _j in range(m)
    ]

    # print() calls instead of Python 2 print statements, so the function
    # also parses under Python 3
    print('rmse', RMSE(means, mu))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval', cdf.Percentile(5), cdf.Percentile(95))

    # estimate the PDF by KDE; the PMF is only needed by the optional
    # plot that is commented out below
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3 * stderr, mu + 3 * stderr, 101)
    pmf = pdf.MakePmf(vals)
    #thinkplot.Pmf(pmf)

    thinkplot.Cdf(cdf)
    thinkplot.Show()
def MakePdfExample(n=500):
    """Plots a normal density function and a KDE estimate.

    Prints the density one standard deviation above the mean and saves
    the figure under the root name 'pdf_example'.

    n: sample size
    """
    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    std = math.sqrt(var)

    # the analytic density, evaluated at one point for reference
    normal = thinkstats2.NormalPdf(mean, std)
    print(normal.Density(mean + std))

    # draw the analytic density
    thinkplot.PrePlot(2)
    thinkplot.Pdf(normal, label='normal')

    # draw the KDE of a random sample of size n
    heights = [random.gauss(mean, std) for _ in range(n)]
    thinkplot.Pdf(thinkstats2.EstimatedPdf(heights), label='sample KDE')

    thinkplot.Save(root='pdf_example',
                   xlabel='Height (cm)',
                   ylabel='Density')
def main():
    """Plots a Gaussian PMF next to the PMF of a KDE from a random sample.

    Seeds the random module with 17, prints the Gaussian density one
    standard deviation above the mean, evaluates both the Gaussian PDF and
    the KDE of a sample of 1000 on the same 100-point grid spanning three
    standard deviations either side of the mean, and saves the figure
    under the root name 'pdf_example'.
    """
    random.seed(17)

    # mean and var of women's heights in cm, from the BRFSS
    mean, var = 163, 52.8
    sigma = math.sqrt(var)

    # make a PDF and compute a density, FWIW
    pdf = thinkstats2.GaussianPdf(mean, sigma)
    # print() call instead of a Python 2 print statement, so the function
    # also parses under Python 3
    print(pdf.Density(mean + sigma))

    # make a PMF and plot it
    xs = numpy.linspace(mean - 3 * sigma, mean + 3 * sigma, 100)
    pmf = pdf.MakePmf(xs)
    thinkplot.Pmf(pmf, label='Gaussian')

    # make a sample, make an estimated PDF, and plot it
    sample = [random.gauss(mean, sigma) for _ in range(1000)]
    sample_pdf = thinkstats2.EstimatedPdf(sample)
    sample_pmf = sample_pdf.MakePmf(xs)
    thinkplot.Pmf(sample_pmf, label='KDE')

    thinkplot.Save(root='pdf_example',
                   xlabel='Height (cm)',
                   ylabel='Density')
import thinkplot

# Plot the PDF object `pdf`, which is defined before this fragment.
thinkplot.Pdf(pdf, label='normal')
thinkplot.Show(xlabel='height (cm)', ylabel='density')

#%%
# Discretize the PDF into a PMF.
pmf = pdf.MakePmf()

#%% [markdown]
# ## 6.2 KDE
#
# - Kernel density estimation

#%%
import random

# Draw 500 values with random.gauss(mean, std), estimate their density
# by KDE, and plot the estimate together with `pdf`.
sample = [random.gauss(mean, std) for _ in range(500)]
sample_pdf = thinkstats2.EstimatedPdf(sample)
thinkplot.Pdf(sample_pdf, label='sample KDE')
thinkplot.Pdf(pdf, label='normal')
thinkplot.Show(xlabel='height (cm)', ylabel='density')

#%%
import numpy as np

# Histogram of the sample after rounding each value down.
hist = thinkstats2.Hist(np.floor(sample))
thinkplot.Hist(hist)

#%%
# CDF of the same floored sample.
cdf = thinkstats2.Cdf(np.floor(sample))
thinkplot.Cdf(cdf)

#%% [markdown]
#
# Build a normal PDF from `mean` and `std` (both defined before this
# fragment), discretize it into a PMF, and show the two together.
pdf = thinkstats2.NormalPdf(mean, std)
pmf = pdf.MakePmf()
thinkplot.PrePlot(2)
thinkplot.Pdf(pdf, label='normal pdf')
thinkplot.Pmf(pmf, label='normal pmf')
thinkplot.Show(xlabel='x', xlim=[140, 186])

## KDE of normal pdf
# Draw i independent samples of 500 values and overlay each sample's KDE
# on the normal PDF; PrePlot is told to expect i + 1 curves.
i = 6
thinkplot.PrePlot(i + 1)
thinkplot.Pdf(pdf, label='normal')
for _ in range(i):
    sample = np.random.normal(mean, std, 500)
    sample_pdf = thinkstats2.EstimatedPdf(sample, label='sample')
    thinkplot.Pdf(sample_pdf, label='sample KDE')
thinkplot.Show(xlabel='x', ylabel='PDF', xlim=[140, 186])

## calculate moments
# `female_heights` and the three moment functions are defined outside
# this fragment.
print('RawMoment')
print(RawMoment(female_heights, 1), RawMoment(female_heights, 2))
print('\n CentralMoment')
print(CentralMoment(female_heights, 1), CentralMoment(female_heights, 2), CentralMoment(female_heights, 3))
print('\n StandardizedMoment')
print(StandardizedMoment(female_heights, 1), StandardizedMoment(female_heights, 2), StandardizedMoment(female_heights, 3))