Beispiel #1
0
def DifferenceInMeans(firsts, others, attr):
    """Compute the difference in means between tables for a given attr.

    Prints the mean of `attr` for each table and the difference between
    them, followed by a blank line.

    firsts: object exposing the attribute named by `attr`
    others: object exposing the attribute named by `attr`
    attr: string name of the attribute to compare

    returns: mean for `others` minus mean for `firsts`
    """
    # Every print below receives one pre-formatted string, so the line
    # produces the same output as a Python 2 print statement and as a
    # Python 3 print() call.
    # NOTE(review): the labels say "trimmed mean" but nothing is trimmed
    # in this function; presumably callers pass trimmed data -- confirm.
    firsts_mean = thinkstats2.Mean(getattr(firsts, attr))
    print('First babies, %s, trimmed mean: %s' % (attr, firsts_mean))

    others_mean = thinkstats2.Mean(getattr(others, attr))
    print('Other babies, %s, trimmed mean: %s' % (attr, others_mean))

    diff = others_mean - firsts_mean
    print('Difference in means: %s' % (diff,))
    print('')

    return diff
def Summarize(estimates, actual=None):
    """Print the mean, standard error and 90% confidence interval.

    estimates: sequence of estimates
    actual: value passed to thinkstats2.Std as `mu`
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    # one labelled value per output line
    print('mean: ', center,
          '\nSE: ', spread,
          '\nCI: ', interval)
Beispiel #3
0
def main():
    """Print the mean pairwise difference and show its PMF as a histogram.

    Loads the frames with first.MakeFrames() and feeds the first of them
    (`live`) to PairwiseDiff.
    """
    live, _firsts, _others = first.MakeFrames()
    pair_diffs = PairwiseDiff(live)

    print('Mean: ', thinkstats2.Mean(pair_diffs))

    thinkplot.Hist(thinkstats2.Pmf(pair_diffs))
    thinkplot.Show(xlabel='Diff in wks', ylabel='PMF')
    def testVar(self):
        """Check Var on a small sample, with and without an explicit mean."""
        sample = [1, 1, 1, 3, 3, 591]
        mu = thinkstats2.Mean(sample)
        variances = (thinkstats2.Var(sample), thinkstats2.Var(sample, mu))

        self.assertAlmostEqual(mu, 100.0)
        # Supplying the mean must not change the result.
        for variance in variances:
            self.assertAlmostEqual(variance, 48217.0)
Beispiel #5
0
def Summarize(estimates, actual=None):
    """Print the mean, standard error and 90% CI of a set of estimates.

    estimates: sequence of estimates
    actual: float actual value, passed to thinkstats2.Std as `mu`
    """
    sample_mean = thinkstats2.Mean(estimates)
    std_err = thinkstats2.Std(estimates, mu=actual)
    conf_int = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print('mean, SE, CI', sample_mean, std_err, conf_int)
Beispiel #6
0
def MakeLinePlot(age_bins):
    """Plot the mean weight of each age bin and save it as 'agemodel_line'.

    age_bins: map from age bin to sequence of weights
    """
    # Walk the sorted keys instead of calling dict.iteritems(), which only
    # exists on Python 2.  Keys are unique, so this is the same order that
    # sorting the (key, value) pairs produced.
    xs = sorted(age_bins)
    ys = [thinkstats2.Mean(age_bins[age_bin]) for age_bin in xs]

    thinkplot.Plot(xs, ys, 'bs-')
    thinkplot.Save(root='agemodel_line',
                xlabel="Mother's age (years)",
                ylabel='Mean birthweight (oz)',
                legend=False)
Beispiel #7
0
def PairWiseDifferences(live):
    """Print the mean and plot the PMF of pairwise length differences.

    Only rows with prglngth >= 37 are used, grouped by the map returned
    from nsfg.MakePregMap; groups with fewer than two rows are ignored.

    live: DataFrame with a prglngth column
    """
    kept = live[live.prglngth >= 37]
    preg_map = nsfg.MakePregMap(kept)

    pair_diffs = []
    for indices in preg_map.values():
        lengths = kept.loc[indices].prglngth.values
        if len(lengths) < 2:
            continue
        pair_diffs.extend(Diffs(lengths))

    print('Mean difference between pairs', thinkstats2.Mean(pair_diffs))

    thinkplot.Hist(thinkstats2.Pmf(pair_diffs), align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
Beispiel #8
0
def Partition(ages, weights, bin_size=2):
    """Break ages into bins.

    Each age is assigned to the bin of width `bin_size` that contains it,
    and the bin is keyed by its midpoint.

    ages: sequence of numeric ages
    weights: sequence of weights, paired with `ages` by position
    bin_size: width of each bin

    returns: map from bin midpoint to list of weights
    """
    weight_dict = {}
    for age, weight in zip(ages, weights):
        # lower edge of the bin holding `age`, shifted to the bin's centre
        midpoint = bin_size * math.floor(age / bin_size) + bin_size / 2.0
        weight_dict.setdefault(midpoint, []).append(weight)

    return weight_dict
Beispiel #9
0
def describe_inc_dist(log_upper):
    """Print summary statistics of the interpolated income sample.

    Reads the module-level `df` and passes it to hinc2.InterpolateSample;
    the returned sample is treated as log10 values and raised back to
    incomes with np.power(10, ...).

    log_upper: value forwarded to hinc2.InterpolateSample as `log_upper`
    """
    # Use the argument itself; the body previously referred to a name `j`
    # that is not defined in this function.
    log_sample = hinc2.InterpolateSample(df, log_upper=log_upper)
    incomes = np.power(10, log_sample)

    inc_mean = thinkstats2.Mean(incomes)
    inc_med = thinkstats2.Median(incomes)
    inc_skew = thinkstats2.Skewness(incomes)
    inc_pearskew = thinkstats2.PearsonMedianSkewness(incomes)
    print('log_upper = ', log_upper)
    print('Mean Income: ', inc_mean)
    print('Median Income: ', inc_med)
    print('Skewness: ', inc_skew)
    print('Pearson Median Skewness: ', inc_pearskew)

    cdf = thinkstats2.Cdf(incomes)
    inc_below_mean = cdf.Prob(inc_mean)
    print('Pct. below mean: ', inc_below_mean)
    print('\n')
Beispiel #10
0
def main(name, data_dir='.'):
    """Plot the sampling distribution of the slope and save it as 'regress1'.

    The line is built with the slope fixed at 0 and the intercept set to
    the mean of ys; its fitted values and residuals are handed to
    SamplingDistributions.

    name: unused
    data_dir: directory passed to ReadData
    """
    # fixed seed so repeated runs produce the same output
    random.seed(17)

    xs, ys = ReadData(data_dir)
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)

    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
Beispiel #11
0
def PairWiseDifferences(live):
    """Print the mean and plot the PMF of pairwise differences per mother.

    Rows with prglngth below 37 are dropped first; groups from
    nsfg.MakePregMap with fewer than two remaining rows contribute nothing.

    live: DataFrame of pregnancy records for live births
    """
    kept = live[live.prglngth >= 37]
    groups = nsfg.MakePregMap(kept).values()

    # pregnancy lengths for each group, one array at a time
    lengths_per_group = (kept.loc[indices].prglngth.values
                         for indices in groups)

    diffs = []
    for lengths in lengths_per_group:
        if len(lengths) >= 2:
            diffs.extend(Diffs(lengths))

    print('Mean difference between pairs', thinkstats2.Mean(diffs))

    hist_pmf = thinkstats2.Pmf(diffs)
    thinkplot.Hist(hist_pmf, align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
Beispiel #12
0
import thinkplot


def Diffs(t):
    """Return the first element of t minus each later element.

    t: indexable, sliceable sequence with at least one element

    returns: list with one difference per element after the first
    """
    head, tail = t[0], t[1:]
    return [head - later for later in tail]


def PairWiseDifference(live):
    """Collect pairwise pregnancy-length differences.

    Keeps rows with prglngth >= 37, groups them with nsfg.MakePregMap and
    applies Diffs to every group that has at least two rows.

    live: DataFrame with a prglngth column

    returns: list of differences
    """
    kept = live[live.prglngth >= 37]
    collected = []
    for row_labels in nsfg.MakePregMap(kept).values():
        lengths = kept.loc[row_labels].prglngth.values
        if len(lengths) < 2:
            continue
        collected.extend(Diffs(lengths))
    return collected


if __name__ == '__main__':
    # Unpack into `firsts` / `others` so the `first` module, whose
    # MakeFrames is called on this line, is not rebound to a frame.
    live, firsts, others = first.MakeFrames()
    diffs = PairWiseDifference(live)
    mean = thinkstats2.Mean(diffs)
    print('Mean difference between pairs', mean)

    # histogram of the differences, bars centred on their values
    pmf = thinkstats2.Pmf(diffs)
    thinkplot.Hist(pmf, align="center")
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
Beispiel #13
0
    greq = preg[preg.agepreg >= 30]
    less = preg[preg.agepreg < 30]

    assert len(greq) == 2635
    assert len(less) == 10606

    return greq, less


def MakePdfs(greq, less):
    """Show estimated PDFs of totalwgt_lb for two frames side by side.

    greq: frame drawn in subplot 1, labelled 'greater/equal to 30'
    less: frame drawn in subplot 2, labelled 'less than 30'
    """
    # Estimate both densities before any plotting starts.
    labelled_pdfs = [
        (thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()),
         'greater/equal to 30'),
        (thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()),
         'less than 30'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (pdf, label) in enumerate(labelled_pdfs, start=1):
        thinkplot.SubPlot(position)
        thinkplot.Pdf(pdf, label=label)
        thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')
    thinkplot.Show()


greq, less = MakeFrames()
MakePdfs(greq, less)
# Each print receives one pre-formatted string, so the line produces the
# same output as a Python 2 print statement and as a Python 3 print() call.
print("greater/equal to 30 skew: %s"
      % (thinkstats2.Skewness(greq.totalwgt_lb.dropna()),))
print("less than 30 skew: %s"
      % (thinkstats2.Skewness(less.totalwgt_lb.dropna()),))
print("greater/equal to 30 mean: %s"
      % (thinkstats2.Mean(greq.totalwgt_lb.dropna()),))
print("greater/equal to 30 median: %s"
      % (thinkstats2.Median(greq.totalwgt_lb.dropna()),))
print("less than 30 mean: %s"
      % (thinkstats2.Mean(less.totalwgt_lb.dropna()),))
print("less than 30 median: %s"
      % (thinkstats2.Median(less.totalwgt_lb.dropna()),))
# Build the CDF of the values in `slopes` and plot it.
cdf = thinkstats2.Cdf(slopes)
thinkplot.Cdf(cdf)

#%%
# Compute the p-value of the slope.
# NOTE(review): presumably cdf[0] is the fraction of slopes at or below 0;
# confirm against thinkstats2.Cdf indexing.
pvalue = cdf[0]
pvalue

#%%
# 90% confidence interval of the slope: the 5th and 95th percentiles of the CDF.
ci = cdf.Percentile(5), cdf.Percentile(95)
ci

#%%
# Mean of the values in `slopes`.
mean = thinkstats2.Mean(slopes)
mean

#%%
# Compute the standard deviation of the sampling distribution, which is the standard error.
stderr = thinkstats2.Std(slopes)
stderr


#%%
# From ThinkStats
# The following function takes a list of estimates and prints the mean, standard error, and 90% confidence interval.
def Summarize(estimates, actual=None):
    """Print the mean, standard error and 90% confidence interval.

    estimates: sequence of estimates
    actual: value passed to thinkstats2.Std as `mu`
    """
    mean = thinkstats2.Mean(estimates)
    stderr = thinkstats2.Std(estimates, mu=actual)
    cdf = thinkstats2.Cdf(estimates)
    # The body used to stop after building the CDF, so nothing was ever
    # reported; finish with the interval and the summary line.
    ci = cdf.ConfidenceInterval(90)
    print('mean, SE, CI', mean, stderr, ci)
 def testMean(self):
     """Check that Mean of a small sample with one large value is 100."""
     sample = [1, 1, 1, 3, 3, 591]
     self.assertEqual(thinkstats2.Mean(sample), 100)
def Summarize(estimates, actual=None):
    """Print mean and standard error to three decimals, plus the 90% CI.

    estimates: sequence of estimates
    actual: value passed to thinkstats2.Std as `mu`
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    report = 'mean: {:.3f} SE: {:.3f} CI: {}'.format(center, spread, interval)
    print(report)
Beispiel #17
0
 def MeanDiagDate(self):
     """Return the mean of the diagdate attribute over self.records."""
     return thinkstats2.Mean([rec.diagdate for rec in self.records])
Beispiel #18
0
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Fits a least-squares line of totalwgt_lb on agepreg, prints fit
    statistics, and saves three figures: 'linear3' (90% and 50% confidence
    bands), 'linear5' (scatter plot with confidence bands) and 'linear4'
    (CDF of the slope under the null hypothesis next to the CDF of the
    sampled slopes).

    live: DataFrame with agepreg and totalwgt_lb columns
    """
    ages = live.agepreg
    weights = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    r2 = thinkstats2.CoefDetermination(weights, res)

    # goodness-of-fit summary
    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(res))

    # plot the 90% and 50% confidence intervals, each with a text label
    inters, slopes = SamplingDistributions(live, iters=1001)
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=90,
                            alpha=0.3,
                            label='90% CI')
    thinkplot.Text(42, 7.53, '90%')
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=50,
                            alpha=0.5,
                            label='50% CI')
    thinkplot.Text(42, 7.59, '50%')

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # scatter plot of the data overlaid with two confidence bands; only the
    # first band is given the residuals
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)
    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # plot the sampling distribution of slope under null hypothesis
    # and alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((ages, weights))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    # compare each fitted parameter with the mean of its sampled values,
    # then summarize the samples using the fitted value as `actual`
    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    thinkplot.PrePlot(2)
    # vertical reference line at slope 0
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')