コード例 #1
0
ファイル: survival.py プロジェクト: wu12345/ThinkStats2
def EstimateHazardFuncion(past, current):
    """Estimates the hazard function with the Kaplan-Meier method.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    past: list of durations for complete pregnancies
    current: list of durations for current pregnancies

    returns: zip(*pairs) over the (t, hazard) pairs, i.e. the times and the
             hazards transposed into two parallel sequences
    """
    num_past = len(past)
    num_current = len(current)

    # probability that a complete pregnancy ended at each duration
    ended_pmf = thinkstats2.MakePmfFromList(past)

    # survival curve computed from the complete pregnancies only
    ts, ss = SurvivalFunction(thinkstats2.MakeCdfFromList(past))

    # CDF of the durations observed so far for current pregnancies
    current_cdf = thinkstats2.MakeCdfFromList(current)

    pairs = []
    for t, surviving in zip(ts, ss):
        num_ended = num_past * ended_pmf.Prob(t)
        # complete cases weighted by the survival value at t, plus the
        # current cases whose observed duration lies beyond t
        num_ongoing = (num_past * surviving +
                       num_current * (1 - current_cdf.Prob(t)))
        # hazard: cases ending at t as a share of all cases at risk at t
        pairs.append((t, num_ended / (num_ended + num_ongoing)))

    return zip(*pairs)
コード例 #2
0
def SamplingDistributions(fxs, fys, res, n=10):
    """Builds sampling distributions of the intercept and slope estimates.

    Calls Permute(fxs, fys, res) n times; each call must return an
    (intercept, slope) pair.

    fxs, fys, res: passed through unchanged to Permute
    n: number of calls to Permute (at least 1, otherwise the unpacking of
       the transposed estimates fails)

    returns: tuple of (Cdf of the intercepts, Cdf of the slopes)
    """
    # NOTE(review): res is handed to Permute as-is on every iteration. If
    # Permute reorders res in place, the caller's sequence is modified;
    # callers that need it untouched should pass a copy -- confirm against
    # Permute's implementation.
    estimates = [Permute(fxs, fys, res) for _ in range(n)]

    # transpose [(inter, slope), ...] into (inters, slopes)
    inters, slopes = zip(*estimates)
    inter_cdf = thinkstats2.MakeCdfFromList(inters)
    slope_cdf = thinkstats2.MakeCdfFromList(slopes)

    return inter_cdf, slope_cdf
コード例 #3
0
def SimulateSample(lam=2, n=10, m=1000):
    """Simulates estimating the rate of an exponential distribution.

    Draws m samples of size n from an exponential distribution with scale
    1/lam, estimates the rate of each sample as 1 / (sample mean), prints
    the RMSE of the estimates and their 5th/95th percentiles, and saves a
    plot of the CDF of the estimates under the root name 'estimation2'.

    lam: rate used to generate the samples
    n: size of each sample
    m: number of samples to simulate

    returns: RMSE of the estimates relative to lam
    """
    def DrawMarker(x, top=1):
        # light grey vertical segment from 0 up to top at position x
        thinkplot.Plot([x, x], [0, top], color='0.8', linewidth=3)

    estimates = [1.0 / np.mean(np.random.exponential(1.0/lam, n))
                 for _ in range(m)]

    stderr = RMSE(estimates, lam)
    print('standard error', stderr)

    cdf = thinkstats2.MakeCdfFromList(estimates)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)

    # mark both ends of the interval on the figure
    for bound in ci:
        DrawMarker(bound)

    thinkplot.Cdf(cdf)
    save_options = dict(root='estimation2',
                        xlabel='estimate',
                        ylabel='CDF',
                        title='Sampling distribution')
    thinkplot.Save(**save_options)

    return stderr
コード例 #4
0
def main(script, filename='mystery0.dat'):
    """Plots the CDF of a data file on six differently scaled panels.

    script: unused by this function
    filename: name of the data file handed to read_file
    """
    data = read_file(filename)
    cdf = thinkstats2.MakeCdfFromList(data)

    def ScaledPanel(index, title, **cdf_options):
        # plot the CDF with the given options and feed whatever
        # thinkplot.Cdf returns back into Config as keyword options
        thinkplot.SubPlot(index)
        options = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **options)

    thinkplot.PrePlot(rows=2, cols=3)

    # panel 1: untransformed CDF
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    ScaledPanel(2, 'logx', xscale='log')
    ScaledPanel(3, 'expo', transform='exponential')

    # panel 4: normal probability plot of the raw data
    thinkplot.SubPlot(4)
    norm_xs, norm_ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(norm_xs, norm_ys)
    thinkplot.Config(title='normal')

    ScaledPanel(5, 'pareto', transform='pareto')
    ScaledPanel(6, 'weibull', transform='weibull')

    thinkplot.Show(legend=False)
コード例 #5
0
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulates the sampling distribution of the mean of a normal sample.

    Draws m samples of size n from a normal distribution, prints the RMSE
    of the sample means relative to mu and their 5th/95th percentiles, and
    saves a plot of the CDF of the means under the root name 'estimation1'.

    mu: mean of the generating distribution
    sigma: standard deviation of the generating distribution
    n: size of each sample
    m: number of samples to simulate
    """
    def DrawMarker(x, top=1):
        # light grey vertical segment from 0 up to top at position x
        thinkplot.Plot([x, x], [0, top], color='0.8', linewidth=3)

    means = [np.mean(np.random.normal(mu, sigma, n)) for _ in range(m)]

    stderr = RMSE(means, mu)
    print('standard error', stderr)

    cdf = thinkstats2.MakeCdfFromList(means)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    print('confidence interval', ci)

    # mark both ends of the interval on the figure
    for bound in ci:
        DrawMarker(bound)

    thinkplot.Cdf(cdf)
    save_options = dict(root='estimation1',
                        xlabel='sample mean',
                        ylabel='CDF',
                        title='Sampling distribution')
    thinkplot.Save(**save_options)
コード例 #6
0
ファイル: estimate1.py プロジェクト: wu12345/ThinkStats2
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulates the sampling distribution of the mean of a normal sample.

    Draws m samples of size n with random.gauss, prints the RMSE of the
    sample means relative to mu and their 5th/95th percentiles, and saves
    a plot of the CDF of the means under the root name 'estimate1'.

    mu: mean of the generating distribution
    sigma: standard deviation of the generating distribution
    n: size of each sample
    m: number of samples to simulate
    """
    means = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _i in range(n)]
        means.append(numpy.mean(xs))

    # A single pre-formatted argument makes print(...) emit the same text
    # whether it is parsed as a Python 2 statement or a Python 3 call.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s' %
          (cdf.Percentile(5), cdf.Percentile(95)))

    # plot the CDF
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='estimate1',
                   xlabel='sample mean',
                   ylabel='CDF',
                   title='Sampling distribution')
コード例 #7
0
def main():
    """Shows four views of the data in 'mystery0.dat' on a 2x2 grid.

    The panels are the Pmf of the raw values, the Pmf of the values
    returned by BinData, a Pmf evaluated from an EstimatedPdf at 101
    evenly spaced points, and the Cdf.
    """
    data = read_file('mystery0.dat')

    raw_pmf = thinkstats2.MakePmfFromList(data)
    cdf = thinkstats2.MakeCdfFromList(data)

    estimated_pdf = thinkstats2.EstimatedPdf(data)
    low, high = min(data), max(data)
    kde_pmf = estimated_pdf.MakePmf(numpy.linspace(low, high, 101))

    binned_pmf = thinkstats2.MakePmfFromList(BinData(data, low, high, 51))

    # (drawing function, distribution, extra options, panel title)
    panels = [
        (thinkplot.Hist, raw_pmf, dict(width=0.1), 'Naive Pmf'),
        (thinkplot.Hist, binned_pmf, {}, 'Binned Hist'),
        (thinkplot.Pmf, kde_pmf, {}, 'KDE PDF'),
        (thinkplot.Cdf, cdf, {}, 'CDF'),
    ]
    for index, (draw, dist, options, title) in enumerate(panels, 1):
        thinkplot.SubPlot(2, 2, index)
        draw(dist, **options)
        thinkplot.Config(title=title)

    thinkplot.Show()
コード例 #8
0
def main():
    """Plots the CDF of 'mystery0.dat' on six differently scaled panels."""
    data = read_file('mystery0.dat')
    cdf = thinkstats2.MakeCdfFromList(data)

    def PlotScaled(panels):
        # each entry is (panel index, title, keyword options for Cdf); the
        # value returned by thinkplot.Cdf is forwarded to Config
        for index, title, cdf_options in panels:
            thinkplot.SubPlot(2, 3, index)
            options = thinkplot.Cdf(cdf, **cdf_options)
            thinkplot.Config(title=title, **options)

    # panel 1: untransformed CDF
    thinkplot.SubPlot(2, 3, 1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    PlotScaled([
        (2, 'logx', dict(xscale='log')),
        (3, 'expo', dict(transform='exponential')),
    ])

    # panel 4: normal probability plot of the raw data
    thinkplot.SubPlot(2, 3, 4)
    norm_xs, norm_ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(norm_xs, norm_ys)
    thinkplot.Config(title='normal')

    PlotScaled([
        (5, 'pareto', dict(transform='pareto')),
        (6, 'weibull', dict(transform='weibull')),
    ])

    thinkplot.Show()
コード例 #9
0
def process_noise(signal, root='red'):
    """Generates a wave from a noise signal and saves a series of figures.

    Figures are saved under the names root + 'noise0' ... root + 'noise6';
    the estimated slope returned by the spectrum is printed along the way.

    signal: object providing make_wave(duration, framerate)
    root: string prefix for the saved figure names
    """
    wave = signal.make_wave(duration=0.5, framerate=11025)

    # 0: waveform
    segment = wave.segment(duration=0.1)
    segment.plot(linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise0', xlabel='time (s)', ylabel='amplitude')

    spectrum = wave.make_spectrum()

    # 1: spectrum
    spectrum.plot_power(linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise1',
                   xlabel='frequency (Hz)',
                   ylabel='power density')

    # only the first of the five values returned by estimate_slope is used
    slope, _, _, _, _ = spectrum.estimate_slope()
    # A single pre-formatted argument makes print(...) emit the same text
    # whether it is parsed as a Python 2 statement or a Python 3 call.
    print('estimated slope %s' % (slope,))

    # 2: integrated spectrum
    integ = spectrum.make_integrated_spectrum()
    integ.plot_power()
    thinkplot.save(root=root + 'noise2',
                   xlabel='frequency (Hz)',
                   ylabel='normalized power')

    # 3: log-log spectral density
    spectrum.plot_power(low=1, linewidth=1, alpha=0.5)
    thinkplot.save(root=root + 'noise3',
                   xlabel='frequency (Hz)',
                   ylabel='power density',
                   xscale='log',
                   yscale='log')

    # 4: CDF of power density
    cdf = thinkstats2.MakeCdfFromList(spectrum.power)
    thinkplot.cdf(cdf)
    thinkplot.save(root=root + 'noise4', xlabel='power density', ylabel='CDF')

    # 5: CCDF of power density, log-y
    thinkplot.cdf(cdf, complement=True)
    thinkplot.save(root=root + 'noise5',
                   xlabel='power density',
                   ylabel='log(CCDF)',
                   yscale='log')

    # 6: normal probability plots of the real and imaginary parts; the
    # imaginary part is shifted by -50 before plotting
    thinkstats2.NormalProbabilityPlot(spectrum.real,
                                      label='real',
                                      data_color='#253494')
    thinkstats2.NormalProbabilityPlot(spectrum.imag - 50,
                                      label='imag-50',
                                      data_color='#1D91C0')
    thinkplot.save(root=root + 'noise6',
                   xlabel='normal sample',
                   ylabel='power density')
コード例 #10
0
ファイル: agemodel.py プロジェクト: wu12345/ThinkStats2
def Process(table, name):
    """Runs various analyses on this table.

    Delegates to cumulative.Process first, then attaches these attributes
    to the table:
        ages: list of agepreg values, skipping records marked 'NA'
        age_pmf: Pmf of ages
        age_cdf: Cdf of ages
        weights: list of totalwgt_oz values, skipping records marked 'NA'
        weight_cdf: Cdf of weights

    table: object with records and name attributes
    name: forwarded to cumulative.Process
    """
    cumulative.Process(table, name)

    ages = []
    for record in table.records:
        if record.agepreg != 'NA':
            ages.append(record.agepreg)
    table.ages = ages
    table.age_pmf = thinkstats2.MakePmfFromList(table.ages, table.name)
    table.age_cdf = thinkstats2.MakeCdfFromList(table.ages, table.name)

    weights = []
    for record in table.records:
        if record.totalwgt_oz != 'NA':
            weights.append(record.totalwgt_oz)
    table.weights = weights
    table.weight_cdf = thinkstats2.MakeCdfFromList(table.weights, table.name)
コード例 #11
0
def Simulate_Sample(lam, n, m=1000):
    """Simulates estimating the rate of an exponential distribution.

    Draws m samples of size n from an exponential distribution with scale
    1/lam and estimates the rate of each sample as 1 / (sample mean).

    lam: rate used to generate the samples
    n: size of each sample
    m: number of samples to simulate

    returns: tuple of (Cdf of the estimates,
                       RMSE of the estimates relative to lam,
                       (5th percentile, 95th percentile) of the estimates)
    """
    estimates = []
    for _ in range(m):
        xs = np.random.exponential(1.0 / lam, n)
        estimates.append(1 / np.mean(xs))

    cdf = thinkstats2.MakeCdfFromList(estimates)
    stderr = estimation.RMSE(estimates, lam)
    ci = cdf.Percentile(5), cdf.Percentile(95)
    return cdf, stderr, ci
コード例 #12
0
    def PValue(self, iters=1000):
        """Computes the sample distribution of the test statistic and p-value.

        Stores the simulated statistics in self.sample_stats and their Cdf
        in self.sample_cdf.

        iters: number of iterations (must be positive)

        returns: float p-value, the fraction of simulated statistics that
                 are greater than or equal to self.actual

        raises: ValueError if iters is not positive
        """
        if iters <= 0:
            raise ValueError('iters must be positive')

        self.sample_stats = [
            self.TestStatistic(self.RunModel()) for _ in range(iters)
        ]
        self.sample_cdf = thinkstats2.MakeCdfFromList(self.sample_stats)

        # A simulated statistic equal to the observed one is "at least as
        # extreme", so ties are counted. 1 - Cdf.Prob(actual) is the
        # fraction strictly above actual and would leave them out.
        extreme = sum(1 for stat in self.sample_stats if stat >= self.actual)

        # float() keeps this a true division under Python 2 as well
        return extreme / float(len(self.sample_stats))
コード例 #13
0
def plot_power_density(root, spectrum):
    """Saves the CDF and the log-y complementary CDF of spectrum.power.

    root: string prefix for the saved figure names ('noise4' and 'noise5'
          are appended)
    spectrum: object with a power attribute
    """
    power_cdf = thinkstats2.MakeCdfFromList(spectrum.power)

    # figure 4: CDF of power density
    thinkplot.cdf(power_cdf)
    cdf_figure = dict(root=root + 'noise4',
                      xlabel='power density',
                      ylabel='CDF')
    thinkplot.save(**cdf_figure)

    # figure 5: complementary CDF with a log-scaled y axis
    thinkplot.cdf(power_cdf, complement=True)
    ccdf_figure = dict(root=root + 'noise5',
                       xlabel='power density',
                       ylabel='log(CCDF)',
                       yscale='log')
    thinkplot.save(**ccdf_figure)
コード例 #14
0
ファイル: survival.py プロジェクト: wu12345/ThinkStats2
def PlotSurvival(durations):
    """Plots the CDF, the survival curve and the hazard curve.

    durations: list of durations
    """
    duration_cdf = thinkstats2.MakeCdfFromList(durations)

    # the CDF is drawn faintly, before the two-curve PrePlot setup
    thinkplot.Cdf(duration_cdf, alpha=0.1)
    thinkplot.PrePlot(2)

    times, survival = SurvivalFunction(duration_cdf)
    thinkplot.Plot(times, survival, label="S(t)")

    thinkplot.Pmf(HazardFunction(times, survival), label='lam(t)')

    thinkplot.Show(xlabel='t (weeks)')
コード例 #15
0
ファイル: survival.py プロジェクト: wu12345/ThinkStats2
def PlotHazard(past, current):
    """Plots the estimated hazard function and the survival curves.

    past: list of durations for complete pregnancies
    current: list of durations for current pregnancies
    """
    # faint reference curve: S(t) computed from past pregnancies alone
    past_ts, past_ss = SurvivalFunction(thinkstats2.MakeCdfFromList(past))
    thinkplot.Plot(past_ts, past_ss, label='old S(t)', alpha=0.1)

    thinkplot.PrePlot(2)

    # hazard estimated from both past and current pregnancies
    haz_ts, lams = EstimateHazardFuncion(past, current)
    thinkplot.Plot(haz_ts, lams, label='lams(t)', alpha=0.5)

    # survival curve derived from that hazard estimate
    surv_ts, surv_ss = MakeSurvivalFromHazard(haz_ts, lams)
    thinkplot.Plot(surv_ts, surv_ss, label='S(t)')
    thinkplot.Show(xlabel='t (weeks)')
コード例 #16
0
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulates the sampling distribution of the mean of a normal sample.

    Draws m samples of size n with random.gauss, prints the RMSE of the
    sample means relative to mu and their 5th/95th percentiles, and shows
    a plot of the CDF of the means.

    mu: mean of the generating distribution
    sigma: standard deviation of the generating distribution
    n: size of each sample
    m: number of samples to simulate
    """
    means = []
    for _ in range(m):
        xs = [random.gauss(mu, sigma) for _i in range(n)]
        means.append(numpy.mean(xs))

    # A single pre-formatted argument makes print(...) emit the same text
    # whether it is parsed as a Python 2 statement or a Python 3 call.
    print('rmse %s' % (RMSE(means, mu),))

    cdf = thinkstats2.MakeCdfFromList(means)
    print('confidence interval %s %s' %
          (cdf.Percentile(5), cdf.Percentile(95)))

    thinkplot.Cdf(cdf)
    thinkplot.Show()
コード例 #17
0
    def testCdf(self):
        """Exercises Cdf lookups, sampling, and every way of building a Cdf."""
        t = [1, 2, 2, 3, 5]
        pmf = thinkstats2.Pmf(t)
        hist = thinkstats2.Hist(t)

        cdf = thinkstats2.Cdf(pmf)
        self.assertEqual(len(str(cdf)), 37)

        # cumulative probabilities for x = 0..6, by indexing and by Probs
        expected_probs = [0, 0.2, 0.6, 0.8, 0.8, 1, 1]
        self.assertEqual(cdf[0], expected_probs[0])
        for x in range(1, 7):
            self.assertAlmostEqual(cdf[x], expected_probs[x])

        for actual, expected in zip(cdf.Probs(range(7)), expected_probs):
            self.assertAlmostEqual(actual, expected)

        # inverse lookups for p = 0, 0.1, ..., 1
        quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        expected_values = [1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5]
        for p, value in zip(quantiles, expected_values):
            self.assertEqual(cdf.Value(p), value)

        values = cdf.ValueArray(np.linspace(0, 1, 11))
        self.assertTrue((values == expected_values).all())

        # seeded so that the sample is reproducible
        np.random.seed(17)
        sample = cdf.Sample(7)
        self.assertListEqual(sample.tolist(), [2, 2, 1, 1, 3, 3, 3])

        # a Cdf built from a Pmf can carry floating-point representation
        # error, so its probabilities are compared approximately
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertAlmostEqual(cdf[2], 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        approx, exact = self.assertAlmostEqual, self.assertEqual

        # (constructor, comparison used for Prob(2)); each Cdf is built
        # lazily, right before it is checked
        builders = [
            (lambda: thinkstats2.MakeCdfFromPmf(pmf), approx),
            (lambda: thinkstats2.MakeCdfFromItems(pmf.Items()), approx),
            (lambda: thinkstats2.Cdf(pmf.d), approx),
            (lambda: thinkstats2.MakeCdfFromDict(pmf.d), approx),
            (lambda: thinkstats2.Cdf(hist), exact),
            (lambda: thinkstats2.MakeCdfFromHist(hist), exact),
            (lambda: thinkstats2.Cdf(t), exact),
            (lambda: thinkstats2.MakeCdfFromList(t), exact),
            (lambda: thinkstats2.Cdf(Counter(t)), exact),
        ]
        for build, compare in builders:
            cdf = build()
            self.assertEqual(len(cdf), 4)
            compare(cdf.Prob(2), 0.6)
            self.assertEqual(cdf.Value(0.6), 2)

        # a copy of the last Cdf answers the same queries
        cdf2 = cdf.Copy()
        self.assertEqual(cdf2.Prob(2), 0.6)
        self.assertEqual(cdf2.Value(0.6), 2)
コード例 #18
0
def Medinan(xs):
    """Returns the value at cumulative probability 0.5 of the Cdf of xs.

    xs: sequence of values
    """
    return thinkstats2.MakeCdfFromList(xs).Value(0.5)
コード例 #19
0
    def testCdf(self):
        """Checks that every way of building a Cdf yields the same answers.

        Uses assertEqual / assertAlmostEqual rather than the deprecated
        assertEquals / assertAlmostEquals aliases, which no longer exist
        in Python 3.12.
        """
        t = [1, 2, 2, 3, 5]
        pmf = thinkstats2.Pmf(t)
        hist = thinkstats2.Hist(t)

        cdf = thinkstats2.Cdf(pmf)
        self.assertEqual(len(str(cdf)), 40)

        # when you make a Cdf from a Pmf, you might get some floating
        # point representation error, hence the approximate comparisons
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertAlmostEqual(cdf[2], 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.MakeCdfFromPmf(pmf)
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.Cdf(pmf.Items())
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.MakeCdfFromItems(pmf.Items())
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.Cdf(pmf.d)
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.MakeCdfFromDict(pmf.d)
        self.assertEqual(len(cdf), 4)
        self.assertAlmostEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        # from here on the probabilities are compared exactly
        cdf = thinkstats2.Cdf(hist)
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.MakeCdfFromHist(hist)
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.Cdf(t)
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.MakeCdfFromList(t)
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        cdf = thinkstats2.Cdf(Counter(t))
        self.assertEqual(len(cdf), 4)
        self.assertEqual(cdf.Prob(2), 0.6)
        self.assertEqual(cdf.Value(0.6), 2)

        # a copy answers the same queries as the Cdf it was made from
        cdf2 = cdf.Copy()
        self.assertEqual(cdf2.Prob(2), 0.6)
        self.assertEqual(cdf2.Value(0.6), 2)
コード例 #20
0
    weights = live.totalwgt_lb
    cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')
    sample = np.random.choice(weights, 1000, replace=True)
    ranks = [cdf.PercentileRank(x) for x in sample]

    rank_cdf = thinkstats2.Cdf(ranks)
    thinkplot.Cdf(rank_cdf)
    thinkplot.Show(xlabel='percentile rank', ylabel='CDF')

    ## my birth weight
    my_weight = 8 + 4 / 16
    my_rank = first_cdf.PercentileRank(my_weight)
    print('my_rank:\n', my_rank)
    calc_weight = first_cdf.Value(my_rank / 100)
    print('calc_weight:\n', calc_weight)

    ## observe random number distribution
    uni = []
    gauss = []
    for i in range(10000):
        uni.append(random.random())
        gauss.append(np.random.normal())

    uni_cdf = thinkstats2.MakeCdfFromList(uni, label='uniform')
    gauss_cdf = thinkstats2.MakeCdfFromList(gauss, label='gauss')

    thinkplot.PrePlot(2)
    thinkplot.Cdf(uni_cdf)
    thinkplot.Cdf(gauss_cdf)
    thinkplot.Show(xlabel='value', ylabel='CDF')
コード例 #21
0
def main(script, filename='data'):
    """Reads a data file and shows the CDF of its values.

    script: unused by this function
    filename: name of the data file handed to read_file
    """
    values = read_file(filename)
    thinkplot.Cdf(thinkstats2.MakeCdfFromList(values))
    thinkplot.Show()