def EstimateHazardFuncion(past, current):
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    past: list of durations for complete pregnancies
    current: list of durations for current pregnancies

    returns: zip(*pairs) where pairs is the list of (t, hazard) tuples,
             one per timestep produced by SurvivalFunction
    """
    # distribution of the pregnancies known to have ended
    ended_pmf = thinkstats2.MakePmfFromList(past)
    num_past = len(past)
    past_cdf = thinkstats2.MakeCdfFromList(past)
    ts, ss = SurvivalFunction(past_cdf)

    # distribution of the durations of pregnancies still in progress
    num_current = len(current)
    current_cdf = thinkstats2.MakeCdfFromList(current)

    def hazard_at(t, s):
        # number of past pregnancies that ended exactly at t
        ended = num_past * ended_pmf.Prob(t)
        # past pregnancies surviving beyond t, plus current pregnancies
        # whose duration so far exceeds t
        ongoing = num_past * s + num_current * (1 - current_cdf.Prob(t))
        at_risk = ended + ongoing
        return ended / at_risk

    pairs = [(t, hazard_at(t, s)) for t, s in zip(ts, ss)]
    return zip(*pairs)
def SamplingDistributions(fxs, fys, res, n=10):
    """Builds Cdfs of the two estimates returned by repeated Permute calls.

    fxs, fys, res: passed through unchanged to Permute on every iteration
    n: number of times Permute is run

    returns: pair of Cdf objects (inter_cdf, slope_cdf)
    """
    # NOTE(review): this copy is never read; Permute receives `res` itself.
    # Presumably the copy was meant to protect the caller's list -- confirm
    # against Permute before changing.
    res_copy = list(res)

    runs = [Permute(fxs, fys, res) for _ in range(n)]

    # transpose the list of (inter, slope) results into two sequences
    inters, slopes = zip(*runs)
    return (thinkstats2.MakeCdfFromList(inters),
            thinkstats2.MakeCdfFromList(slopes))
def SimulateSample(lam=2, n=10, m=1000):
    """Plots the sampling distribution of an estimator of lam.

    Runs m experiments; each draws n values with
    np.random.exponential(1.0/lam, n) and estimates lam as 1/mean.
    Prints the RMSE of the estimates and their 5th/95th percentiles,
    then saves a CDF plot under root 'estimation2'.

    lam: parameter of the exponential distribution
    n: sample size of each experiment
    m: number of experiments

    returns: RMSE of the estimates relative to lam
    """
    def draw_marker(x, y=1):
        # light grey vertical line from 0 up to y at position x
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    estimates = [1.0 / np.mean(np.random.exponential(1.0/lam, n))
                 for _ in range(m)]

    stderr = RMSE(estimates, lam)
    print('standard error', stderr)

    cdf = thinkstats2.MakeCdfFromList(estimates)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)
    for bound in ci:
        draw_marker(bound)

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimation2',
                   xlabel='estimate',
                   ylabel='CDF',
                   title='Sampling distribution')

    return stderr
def main(script, filename='mystery0.dat'):
    """Plots the CDF of the data in filename under six different scalings.

    script: unused by this function
    filename: file handed to read_file
    """
    data = read_file(filename)
    cdf = thinkstats2.MakeCdfFromList(data)

    def scaled_panel(index, title, **cdf_options):
        # draw the CDF with the given options and apply the scale
        # options that thinkplot.Cdf returns
        thinkplot.SubPlot(index)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    thinkplot.PrePlot(rows=2, cols=3)

    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    scaled_panel(2, 'logx', xscale='log')
    scaled_panel(3, 'expo', transform='exponential')

    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    scaled_panel(5, 'pareto', transform='pareto')
    scaled_panel(6, 'weibull', transform='weibull')

    thinkplot.Show(legend=False)
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Plots the sampling distribution of the sample mean.

    Runs m experiments; each draws n values with
    np.random.normal(mu, sigma, n) and records their mean.  Prints the
    RMSE of the means and their 5th/95th percentiles, then saves a CDF
    plot under root 'estimation1'.

    mu: mean passed to np.random.normal
    sigma: standard deviation passed to np.random.normal
    n: sample size of each experiment
    m: number of experiments
    """
    def draw_marker(x, y=1):
        # light grey vertical line from 0 up to y at position x
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    means = [np.mean(np.random.normal(mu, sigma, n)) for _ in range(m)]

    stderr = RMSE(means, mu)
    print('standard error', stderr)

    cdf = thinkstats2.MakeCdfFromList(means)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)
    for bound in ci:
        draw_marker(bound)

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimation1',
                   xlabel='sample mean',
                   ylabel='CDF',
                   title='Sampling distribution')
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Plots the sampling distribution of the sample mean.

    Runs m experiments; each draws n values with random.gauss(mu, sigma)
    and records their mean.  Prints the RMSE of the means and their
    5th/95th percentiles, then saves a CDF plot under root 'estimate1'.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: sample size of each experiment
    m: number of experiments
    """
    means = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _ in range(n)]
        means.append(numpy.mean(xs))

    # Single-argument print(...) is valid on Python 2 and 3 and emits the
    # same text the former print statement did.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s' % (cdf.Percentile(5),
                                         cdf.Percentile(95)))

    # estimate the PDF by KDE; the result is only needed by the
    # Pmf plot below, which is currently disabled
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3*stderr, mu + 3*stderr, 101)
    pmf = pdf.MakePmf(vals)
    #thinkplot.Pmf(pmf)

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimate1',
                   xlabel='sample mean',
                   ylabel='CDF',
                   title='Sampling distribution')
def main():
    """Shows four views of the data in 'mystery0.dat' on a 2x2 grid.

    The panels are: a Hist of the raw Pmf, a Hist of the binned data,
    a Pmf derived from the estimated PDF, and the CDF.
    """
    filename = 'mystery0.dat'
    data = read_file(filename)

    pmf = thinkstats2.MakePmfFromList(data)
    cdf = thinkstats2.MakeCdfFromList(data)

    # evaluate the estimated PDF at 101 points spanning the data range
    pdf = thinkstats2.EstimatedPdf(data)
    low, high = min(data), max(data)
    kde_pmf = pdf.MakePmf(numpy.linspace(low, high, 101))

    bin_pmf = thinkstats2.MakePmfFromList(BinData(data, low, high, 51))

    # (title, drawing callable) for each panel, in subplot order
    panels = [
        ('Naive Pmf', lambda: thinkplot.Hist(pmf, width=0.1)),
        ('Binned Hist', lambda: thinkplot.Hist(bin_pmf)),
        ('KDE PDF', lambda: thinkplot.Pmf(kde_pmf)),
        ('CDF', lambda: thinkplot.Cdf(cdf)),
    ]
    for index, (title, draw) in enumerate(panels, 1):
        thinkplot.SubPlot(2, 2, index)
        draw()
        thinkplot.Config(title=title)

    thinkplot.Show()
def main():
    """Plots the CDF of 'mystery0.dat' under six scalings on a 2x3 grid."""
    filename = 'mystery0.dat'
    data = read_file(filename)
    cdf = thinkstats2.MakeCdfFromList(data)

    # Each drawing callable returns the extra keyword options to hand to
    # thinkplot.Config for its panel.
    def linear():
        thinkplot.Cdf(cdf)
        return {}

    def normal_probability():
        xs, ys = thinkstats2.NormalProbability(data)
        thinkplot.Plot(xs, ys)
        return {}

    panels = [
        ('linear', linear),
        ('logx', lambda: thinkplot.Cdf(cdf, xscale='log')),
        ('expo', lambda: thinkplot.Cdf(cdf, transform='exponential')),
        ('normal', normal_probability),
        ('pareto', lambda: thinkplot.Cdf(cdf, transform='pareto')),
        ('weibull', lambda: thinkplot.Cdf(cdf, transform='weibull')),
    ]
    for index, (title, draw) in enumerate(panels, 1):
        thinkplot.SubPlot(2, 3, index)
        options = draw()
        thinkplot.Config(title=title, **options)

    thinkplot.Show()
def process_noise(signal, root='red'):
    """Generates a series of diagnostic plots for a noise signal.

    Makes a 0.5 s wave at 11025 frames per second from the signal and
    saves seven figures named root + 'noise0' ... root + 'noise6'.
    Also prints the slope returned by spectrum.estimate_slope().

    signal: object providing make_wave(duration, framerate)
    root: string prefix for the saved figure names
    """
    wave = signal.make_wave(duration=0.5, framerate=11025)

    # 0: waveform
    segment = wave.segment(duration=0.1)
    segment.plot(linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise0',
                   xlabel='time (s)',
                   ylabel='amplitude')

    spectrum = wave.make_spectrum()

    # 1: spectrum
    spectrum.plot_power(linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise1',
                   xlabel='frequency (Hz)',
                   ylabel='power density')

    # only the first of the five returned values is used
    slope, _, _, _, _ = spectrum.estimate_slope()
    # Single-argument print(...) is valid on Python 2 and 3 and emits the
    # same text the former print statement did.
    print('estimated slope %s' % (slope,))

    # 2: integrated spectrum
    integ = spectrum.make_integrated_spectrum()
    integ.plot_power()
    thinkplot.save(root=root + 'noise2',
                   xlabel='frequency (Hz)',
                   ylabel='normalized power')

    # 3: log-log spectral density
    spectrum.plot_power(low=1, linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise3',
                   xlabel='frequency (Hz)',
                   ylabel='power density',
                   xscale='log',
                   yscale='log')

    # 4: CDF of power density
    cdf = thinkstats2.MakeCdfFromList(spectrum.power)
    thinkplot.cdf(cdf)
    thinkplot.save(root=root + 'noise4',
                   xlabel='power density',
                   ylabel='CDF')

    # 5: CCDF of power density, log-y
    thinkplot.cdf(cdf, complement=True)
    thinkplot.save(root=root + 'noise5',
                   xlabel='power density',
                   ylabel='log(CCDF)',
                   yscale='log')

    # 6: normal probability plots of the real and imaginary parts; the
    # imaginary part is shifted down by 50 before plotting
    thinkstats2.NormalProbabilityPlot(spectrum.real,
                                      label='real',
                                      data_color='#253494')
    thinkstats2.NormalProbabilityPlot(spectrum.imag - 50,
                                      label='imag-50',
                                      data_color='#1D91C0')
    thinkplot.save(root=root + 'noise6',
                   xlabel='normal sample',
                   ylabel='power density')
def Process(table, name):
    """Runs various analyses on this table.

    Delegates to cumulative.Process first, then attaches to the table:

    ages: list of agepreg values of the records, skipping 'NA'
    age_pmf: Pmf object made from ages
    age_cdf: Cdf object made from ages
    weights: list of totalwgt_oz values of the records, skipping 'NA'
    weight_cdf: Cdf object made from weights

    table: object with a `records` sequence; its `name` attribute is
           passed as the second argument to the Pmf/Cdf constructors
    name: forwarded to cumulative.Process
    """
    cumulative.Process(table, name)

    ages = []
    for record in table.records:
        if record.agepreg != 'NA':
            ages.append(record.agepreg)
    table.ages = ages
    table.age_pmf = thinkstats2.MakePmfFromList(table.ages, table.name)
    table.age_cdf = thinkstats2.MakeCdfFromList(table.ages, table.name)

    weights = []
    for record in table.records:
        if record.totalwgt_oz != 'NA':
            weights.append(record.totalwgt_oz)
    table.weights = weights
    table.weight_cdf = thinkstats2.MakeCdfFromList(table.weights, table.name)
def Simulate_Sample(lam, n, m=1000):
    """Simulates the sampling distribution of an estimator of lam.

    Runs m experiments; each draws n values with
    np.random.exponential(1.0 / lam, n) and estimates lam as 1/mean.

    lam: parameter of the exponential distribution
    n: sample size of each experiment
    m: number of experiments

    returns: tuple (cdf, stderr, ci) where cdf is the Cdf of the
             estimates, stderr is estimation.RMSE of the estimates
             relative to lam, and ci is the pair of 5th and 95th
             percentiles of cdf
    """
    estimates = [1 / np.mean(np.random.exponential(1.0 / lam, n))
                 for _ in range(m)]

    cdf = thinkstats2.MakeCdfFromList(estimates)
    stderr = estimation.RMSE(estimates, lam)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    return cdf, stderr, ci
def PValue(self, iters=1000):
    """Computes the sample distribution of the test statistic and p-value.

    Stores the simulated statistics in self.sample_stats and their Cdf
    in self.sample_cdf.

    iters: number of iterations

    returns: float p-value, computed as 1 minus the Cdf evaluated at
             self.actual
    """
    simulated = []
    for _ in range(iters):
        simulated.append(self.TestStatistic(self.RunModel()))

    self.sample_stats = simulated
    self.sample_cdf = thinkstats2.MakeCdfFromList(self.sample_stats)

    return 1 - self.sample_cdf.Prob(self.actual)
def plot_power_density(root, spectrum):
    """Plots the CDF and the complementary CDF of spectrum.power.

    Saves two figures: root + 'noise4' (CDF) and root + 'noise5'
    (complementary CDF with a log y scale).

    root: string prefix for the saved figure names
    spectrum: object with a `power` attribute
    """
    power_cdf = thinkstats2.MakeCdfFromList(spectrum.power)

    # 4: CDF of power density
    thinkplot.cdf(power_cdf)
    thinkplot.save(root=root + 'noise4',
                   xlabel='power density',
                   ylabel='CDF')

    # 5: CCDF of power density, log-y
    thinkplot.cdf(power_cdf, complement=True)
    thinkplot.save(root=root + 'noise5',
                   xlabel='power density',
                   ylabel='log(CCDF)',
                   yscale='log')
def PlotSurvival(durations):
    """Plots survival and hazard curves.

    Draws the CDF of the durations faintly, then the survival curve
    from SurvivalFunction and the hazard function from HazardFunction.

    durations: list of durations
    """
    duration_cdf = thinkstats2.MakeCdfFromList(durations)
    thinkplot.Cdf(duration_cdf, alpha=0.1)
    thinkplot.PrePlot(2)

    times, survival = SurvivalFunction(duration_cdf)
    thinkplot.Plot(times, survival, label="S(t)")

    thinkplot.Pmf(HazardFunction(times, survival), label='lam(t)')

    thinkplot.Show(xlabel='t (weeks)')
def PlotHazard(past, current):
    """Plots the hazard function and survival function.

    past: list of durations for complete pregnancies
    current: list of durations for current pregnancies
    """
    # faint reference curve: S(t) based on only past pregnancies
    past_cdf = thinkstats2.MakeCdfFromList(past)
    old_ts, old_ss = SurvivalFunction(past_cdf)
    thinkplot.Plot(old_ts, old_ss, label='old S(t)', alpha=0.1)

    thinkplot.PrePlot(2)

    # hazard estimated from both past and current pregnancies
    ts, lams = EstimateHazardFuncion(past, current)
    thinkplot.Plot(ts, lams, label='lams(t)', alpha=0.5)

    # survival curve rebuilt from that hazard function
    new_ts, new_ss = MakeSurvivalFromHazard(ts, lams)
    thinkplot.Plot(new_ts, new_ss, label='S(t)')

    thinkplot.Show(xlabel='t (weeks)')
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Shows the sampling distribution of the sample mean.

    Runs m experiments; each draws n values with random.gauss(mu, sigma)
    and records their mean.  Prints the RMSE of the means and their
    5th/95th percentiles, then shows a CDF plot.

    mu: mean passed to random.gauss
    sigma: standard deviation passed to random.gauss
    n: sample size of each experiment
    m: number of experiments
    """
    means = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _ in range(n)]
        means.append(numpy.mean(xs))

    # Single-argument print(...) is valid on Python 2 and 3 and emits the
    # same text the former print statement did.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s' % (cdf.Percentile(5),
                                         cdf.Percentile(95)))

    # KDE of the means; the result is only needed by the Pmf plot
    # below, which is currently disabled
    pdf = thinkstats2.EstimatedPdf(means)
    stderr = sigma / math.sqrt(n)
    vals = numpy.linspace(mu - 3 * stderr, mu + 3 * stderr, 101)
    pmf = pdf.MakePmf(vals)
    #thinkplot.Pmf(pmf)

    thinkplot.Cdf(cdf)
    thinkplot.Show()
def testCdf(self):
    """Exercises Cdf lookups, inverse lookups, sampling and constructors."""
    t = [1, 2, 2, 3, 5]
    pmf = thinkstats2.Pmf(t)
    hist = thinkstats2.Hist(t)

    def check_approx(candidate):
        # Cdfs derived from a Pmf may carry floating-point error in Prob
        self.assertEqual(len(candidate), 4)
        self.assertAlmostEqual(candidate.Prob(2), 0.6)
        self.assertEqual(candidate.Value(0.6), 2)

    def check_exact(candidate):
        self.assertEqual(len(candidate), 4)
        self.assertEqual(candidate.Prob(2), 0.6)
        self.assertEqual(candidate.Value(0.6), 2)

    cdf = thinkstats2.Cdf(pmf)
    self.assertEqual(len(str(cdf)), 37)

    # forward lookups by indexing and by Probs, for x = 0..6
    expected_probs = [0, 0.2, 0.6, 0.8, 0.8, 1, 1]
    self.assertEqual(cdf[0], 0)
    for x, expected in enumerate(expected_probs[1:], 1):
        self.assertAlmostEqual(cdf[x], expected)

    for actual, expected in zip(cdf.Probs(range(7)), expected_probs):
        self.assertAlmostEqual(actual, expected)

    # inverse lookups for p = 0, 0.1, ..., 1
    probs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    expected_values = [1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5]
    for p, expected in zip(probs, expected_values):
        self.assertEqual(cdf.Value(p), expected)

    values = cdf.ValueArray(np.linspace(0, 1, 11))
    self.assertTrue((values == expected_values).all())

    # sampling with a fixed seed
    np.random.seed(17)
    sample = cdf.Sample(7)
    self.assertListEqual(sample.tolist(), [2, 2, 1, 1, 3, 3, 3])

    # when you make a Cdf from a Pdf, you might get some floating
    # point representation error
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertAlmostEqual(cdf[2], 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    check_approx(thinkstats2.MakeCdfFromPmf(pmf))
    check_approx(thinkstats2.MakeCdfFromItems(pmf.Items()))
    check_approx(thinkstats2.Cdf(pmf.d))
    check_approx(thinkstats2.MakeCdfFromDict(pmf.d))

    check_exact(thinkstats2.Cdf(hist))
    check_exact(thinkstats2.MakeCdfFromHist(hist))
    check_exact(thinkstats2.Cdf(t))
    check_exact(thinkstats2.MakeCdfFromList(t))

    cdf = thinkstats2.Cdf(Counter(t))
    check_exact(cdf)

    cdf2 = cdf.Copy()
    self.assertEqual(cdf2.Prob(2), 0.6)
    self.assertEqual(cdf2.Value(0.6), 2)
def Medinan(xs):
    """Returns the value at cumulative probability 0.5 of the Cdf of xs.

    xs: sequence of values
    """
    return thinkstats2.MakeCdfFromList(xs).Value(0.5)
def testCdf(self):
    """Checks that every way of constructing a Cdf gives the same result.

    Uses assertEqual/assertAlmostEqual rather than the deprecated
    assertEquals/assertAlmostEquals aliases, which newer unittest
    versions no longer provide.
    """
    t = [1, 2, 2, 3, 5]
    pmf = thinkstats2.Pmf(t)
    hist = thinkstats2.Hist(t)

    cdf = thinkstats2.Cdf(pmf)
    self.assertEqual(len(str(cdf)), 40)

    # when you make a Cdf from a Pdf, you might get some floating
    # point representation error
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertAlmostEqual(cdf[2], 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.MakeCdfFromPmf(pmf)
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.Cdf(pmf.Items())
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.MakeCdfFromItems(pmf.Items())
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.Cdf(pmf.d)
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.MakeCdfFromDict(pmf.d)
    self.assertEqual(len(cdf), 4)
    self.assertAlmostEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    # from here on the probabilities are compared exactly
    cdf = thinkstats2.Cdf(hist)
    self.assertEqual(len(cdf), 4)
    self.assertEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.MakeCdfFromHist(hist)
    self.assertEqual(len(cdf), 4)
    self.assertEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.Cdf(t)
    self.assertEqual(len(cdf), 4)
    self.assertEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.MakeCdfFromList(t)
    self.assertEqual(len(cdf), 4)
    self.assertEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf = thinkstats2.Cdf(Counter(t))
    self.assertEqual(len(cdf), 4)
    self.assertEqual(cdf.Prob(2), 0.6)
    self.assertEqual(cdf.Value(0.6), 2)

    cdf2 = cdf.Copy()
    self.assertEqual(cdf2.Prob(2), 0.6)
    self.assertEqual(cdf2.Value(0.6), 2)
# Script fragment: relies on `live` and `first_cdf` being defined earlier
# in the file (not visible here).

# Resample 1000 weights with replacement, look up the percentile rank of
# each in the CDF of all weights, and plot the CDF of those ranks.
weights = live.totalwgt_lb
cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')
sample = np.random.choice(weights, 1000, replace=True)
ranks = [cdf.PercentileRank(x) for x in sample]
rank_cdf = thinkstats2.Cdf(ranks)
thinkplot.Cdf(rank_cdf)
thinkplot.Show(xlabel='percentile rank', ylabel='CDF')

## my birth weight
# NOTE(review): `4 / 16` and `my_rank / 100` assume true division
# (Python 3 or `from __future__ import division`) -- confirm.
my_weight = 8 + 4 / 16
my_rank = first_cdf.PercentileRank(my_weight)
print('my_rank:\n', my_rank)
# map the percentile rank back to a value through the same CDF
calc_weight = first_cdf.Value(my_rank / 100)
print('calc_weight:\n', calc_weight)

## observe random number distribution
# 10000 draws each from random.random() and np.random.normal()
uni = []
gauss = []
for i in range(10000):
    uni.append(random.random())
    gauss.append(np.random.normal())
uni_cdf = thinkstats2.MakeCdfFromList(uni, label='uniform')
gauss_cdf = thinkstats2.MakeCdfFromList(gauss, label='gauss')
thinkplot.PrePlot(2)
thinkplot.Cdf(uni_cdf)
thinkplot.Cdf(gauss_cdf)
thinkplot.Show(xlabel='value', ylabel='CDF')
def main(script, filename='data'):
    """Reads values from filename and shows their CDF.

    script: unused by this function
    filename: file handed to read_file
    """
    values = read_file(filename)
    thinkplot.Cdf(thinkstats2.MakeCdfFromList(values))
    thinkplot.Show()