def DifferenceInMeans(firsts, others, attr):
    """Compute the difference in means between tables for a given attr.

    Prints summary statistics.

    firsts: object whose attribute `attr` holds the values for first babies
    others: object whose attribute `attr` holds the values for other babies
    attr: string name of the attribute to compare

    returns: mean of `others` minus mean of `firsts`
    """
    # Single-argument print(...) calls produce the same text on Python 2
    # and Python 3, unlike the print statement (Python 2 only).
    # NOTE(review): the labels say "trimmed mean" but the values come from
    # thinkstats2.Mean -- confirm which one was intended.
    firsts_mean = thinkstats2.Mean(getattr(firsts, attr))
    print('First babies, %s, trimmed mean: %s' % (attr, firsts_mean))

    others_mean = thinkstats2.Mean(getattr(others, attr))
    print('Other babies, %s, trimmed mean: %s' % (attr, others_mean))

    diff = others_mean - firsts_mean
    print('Difference in means: %s' % (diff,))
    print('')

    return diff
def Summarize(estimates, actual=None):
    """Print the mean, SE and 90% confidence interval of the estimates.

    estimates: sequence of estimates
    actual: value forwarded to thinkstats2.Std as `mu` (None by default)
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print('mean: ', center, '\nSE: ', spread, '\nCI: ', interval)
def main():
    """Print the mean of the pairwise differences and show their PMF."""
    # Only the first of the three frames is used here.
    live, _firsts, _others = first.MakeFrames()
    pairwise = PairwiseDiff(live)

    print('Mean: ', thinkstats2.Mean(pairwise))

    thinkplot.Hist(thinkstats2.Pmf(pairwise))
    thinkplot.Show(xlabel='Diff in wks', ylabel='PMF')
def testVar(self):
    """Check Mean and Var, with and without an explicitly supplied mean."""
    sample = [1, 1, 1, 3, 3, 591]
    mu = thinkstats2.Mean(sample)
    var_default = thinkstats2.Var(sample)
    var_given_mu = thinkstats2.Var(sample, mu)

    checks = [
        (mu, 100.0),
        (var_default, 48217.0),
        (var_given_mu, 48217.0),
    ]
    for observed, expected in checks:
        self.assertAlmostEqual(observed, expected)
def Summarize(estimates, actual=None):
    """Print the mean, standard error and 90% confidence interval.

    estimates: sequence of estimates
    actual: float actual value, forwarded to thinkstats2.Std as `mu`
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print('mean, SE, CI', center, spread, interval)
def MakeLinePlot(age_bins):
    """Plot the mean weight of each age bin and save the figure.

    The bins are plotted in sorted order and the figure is saved with
    root 'agemodel_line'.

    age_bins: map from bin to sequence of weights
    """
    xs = []
    ys = []
    # .items() works on both Python 2 and Python 3 mappings, whereas
    # .iteritems() exists only on Python 2 dicts.
    for age_bin, weights in sorted(age_bins.items()):
        xs.append(age_bin)
        ys.append(thinkstats2.Mean(weights))

    thinkplot.Plot(xs, ys, 'bs-')
    thinkplot.Save(root='agemodel_line',
                   xlabel="Mother's age (years)",
                   ylabel='Mean birthweight (oz)',
                   legend=False)
def PairWiseDifferences(live):
    """Print the mean pairwise difference in prglngth and show its PMF.

    Only records with prglngth >= 37 are considered, and only groups from
    nsfg.MakePregMap with at least two records contribute differences.

    live: DataFrame with a prglngth column
    """
    live = live[live.prglngth >= 37]
    groups = nsfg.MakePregMap(live)

    all_diffs = []
    for _, rows in groups.items():
        preg_lengths = live.loc[rows].prglngth.values
        if len(preg_lengths) < 2:
            continue
        all_diffs.extend(Diffs(preg_lengths))

    print('Mean difference between pairs', thinkstats2.Mean(all_diffs))

    thinkplot.Hist(thinkstats2.Pmf(all_diffs), align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
def Partition(ages, weights, bin_size=2):
    """Break ages into bins.

    Each age falls into the bin of width bin_size that contains it, and
    the bin is labeled by its midpoint.

    ages: sequence of ages
    weights: sequence of weights, parallel to ages
    bin_size: width of each bin

    returns: map from bin midpoint to list of weights
    """
    weight_dict = {}
    for age, weight in zip(ages, weights):
        # With bin_size=2, ages in [20, 22) all land in the bin labeled 21.0.
        age_bin = bin_size * math.floor(age / bin_size) + bin_size / 2.0
        weight_dict.setdefault(age_bin, []).append(weight)

    # A bin is only created when a weight is appended to it, so no bin can
    # be empty and no per-bin sanity pass is needed before returning.
    return weight_dict
def describe_inc_dist(log_upper):
    """Print summary statistics of the interpolated income sample.

    Draws a sample with hinc2.InterpolateSample from the module-level
    `df`, raises 10 to the power of each sampled value, and prints the
    mean, median, skewness, Pearson median skewness and the CDF value at
    the mean.

    log_upper: value forwarded to hinc2.InterpolateSample as `log_upper`
    """
    # Use the parameter itself; the previous code read an unrelated
    # global `j` and ignored the argument.
    log_sample = hinc2.InterpolateSample(df, log_upper=log_upper)
    incomes = np.power(10, log_sample)

    inc_mean = thinkstats2.Mean(incomes)
    inc_med = thinkstats2.Median(incomes)
    inc_skew = thinkstats2.Skewness(incomes)
    inc_pearskew = thinkstats2.PearsonMedianSkewness(incomes)

    print('log_upper = ', log_upper)
    print('Mean Income: ', inc_mean)
    print('Median Income: ', inc_med)
    print('Skewness: ', inc_skew)
    print('Pearson Median Skewness: ', inc_pearskew)

    cdf = thinkstats2.Cdf(incomes)
    inc_below_mean = cdf.Prob(inc_mean)
    print('Pct. below mean: ', inc_below_mean)
    print('\n')
def main(name, data_dir='.'):
    """Plot the sampling distribution of the slope and save it.

    Seeds the random module, reads xs and ys with ReadData, builds a line
    with intercept Mean(ys) and slope 0, and passes the fitted values and
    residuals to SamplingDistributions.  The CDF of the resulting slopes
    is saved with root 'regress1'.

    name: unused
    data_dir: directory passed to ReadData
    """
    random.seed(17)

    xs, ys = ReadData(data_dir)
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')

    # NOTE(review): this early return makes everything below unreachable.
    # It is kept so runtime behavior is unchanged; confirm whether the
    # least-squares analysis below should be restored or deleted.
    return

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    # Single-argument print(...) calls are valid on Python 2 and Python 3.
    print('inter %s' % (inter,))
    print('slope %s' % (slope,))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    # Floor division keeps the index an integer on Python 3 as well.
    i = len(fxs) // 2
    print('median weight, age %s %s' % (fxs[i], fys[i]))

    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 %s' % (R2,))
    print('R %s' % (math.sqrt(R2),))

    #thinkplot.Plot(fxs, fys, color='gray', alpha=0.5)
    #thinkplot.Scatter(xs, ys, alpha=0.05)
    #thinkplot.Show()

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
def PairWiseDifferences(live):
    """Summarize pairwise differences for children of the same mother.

    Prints the mean difference and shows a histogram of the PMF.  Only
    records with prglngth >= 37 are used.

    live: DataFrame of pregnancy records for live births
    """
    full_term = live[live.prglngth >= 37]

    collected = []
    for _caseid, row_labels in nsfg.MakePregMap(full_term).items():
        lengths = full_term.loc[row_labels].prglngth.values
        # A single record has nothing to be compared against.
        if len(lengths) >= 2:
            collected.extend(Diffs(lengths))

    mean_diff = thinkstats2.Mean(collected)
    print('Mean difference between pairs', mean_diff)

    diff_pmf = thinkstats2.Pmf(collected)
    thinkplot.Hist(diff_pmf, align='center')
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
import thinkplot


def Diffs(t):
    """Return the first element of t minus each of the later elements.

    t: non-empty sequence

    returns: list of differences, one per element after the first
    """
    head = t[0]
    rest = t[1:]
    return [head - x for x in rest]


def PairWiseDifference(live):
    """Collect differences in prglngth within each group of records.

    Only records with prglngth >= 37 are used, and only groups from
    nsfg.MakePregMap with at least two records contribute.

    live: DataFrame with a prglngth column

    returns: list of differences
    """
    live = live[live.prglngth >= 37]
    preg_map = nsfg.MakePregMap(live)

    diffs = []
    for caseid, indices in preg_map.items():
        lengths = live.loc[indices].prglngth.values
        if len(lengths) >= 2:
            diffs.extend(Diffs(lengths))
    return diffs


if __name__ == '__main__':
    # Bind the frames to names that do not shadow the `first` module.
    live, firsts, others = first.MakeFrames()
    diffs = PairWiseDifference(live)

    mean = thinkstats2.Mean(diffs)
    print('Mean difference between pairs', mean)

    pmf = thinkstats2.Pmf(diffs)
    thinkplot.Hist(pmf, align="center")
    thinkplot.Show(xlabel='Difference in weeks', ylabel='PMF')
greq = preg[preg.agepreg >= 30] less = preg[preg.agepreg < 30] assert len(greq) == 2635 assert len(less) == 10606 return greq, less def MakePdfs(greq, less): greqpdf = thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()) lesspdf = thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()) thinkplot.PrePlot(rows=1, cols=2) thinkplot.SubPlot(1) thinkplot.Pdf(greqpdf, label='greater/equal to 30') thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF') thinkplot.SubPlot(2) thinkplot.Pdf(lesspdf, label='less than 30') thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF') thinkplot.Show() greq, less = MakeFrames() MakePdfs(greq, less) print "greater/equal to 30 skew:", thinkstats2.Skewness(greq.totalwgt_lb.dropna()) print "less than 30 skew:", thinkstats2.Skewness(less.totalwgt_lb.dropna()) print "greater/equal to 30 mean:", thinkstats2.Mean(greq.totalwgt_lb.dropna()) print "greater/equal to 30 median:", thinkstats2.Median(greq.totalwgt_lb.dropna()) print "less than 30 mean:", thinkstats2.Mean(less.totalwgt_lb.dropna()) print "less than 30 median:", thinkstats2.Median(less.totalwgt_lb.dropna())
cdf = thinkstats2.Cdf(slopes) thinkplot.Cdf(cdf) #%% # Compute the p-value of the slope. pvalue = cdf[0] pvalue #%% # 90% confidence interval of slope ci = cdf.Percentile(5), cdf.Percentile(95) ci #%% # get the mean of the new sampling dist. mean = thinkstats2.Mean(slopes) mean #%% # Compute the standard deviation of the sampling distribution, which is the standard error. stderr = thinkstats2.Std(slopes) stderr #%% # From ThinkStats # The following function takes a list of estimates and prints the mean, standard error, and 90% confidence interval. def Summarize(estimates, actual=None): mean = thinkstats2.Mean(estimates) stderr = thinkstats2.Std(estimates, mu=actual) cdf = thinkstats2.Cdf(estimates)
def testMean(self):
    """Check that Mean of a small fixed sample equals 100."""
    sample = [1, 1, 1, 3, 3, 591]
    self.assertEqual(thinkstats2.Mean(sample), 100)
def Summarize(estimates, actual=None):
    """Print the mean, SE and 90% confidence interval on one line.

    The mean and SE are formatted with three decimal places.

    estimates: sequence of estimates
    actual: value forwarded to thinkstats2.Std as `mu` (None by default)
    """
    center = thinkstats2.Mean(estimates)
    spread = thinkstats2.Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)

    template = 'mean: {:.3f} SE: {:.3f} CI: {}'
    print(template.format(center, spread, interval))
def MeanDiagDate(self):
    """Return the mean of the diagdate attribute over self.records."""
    return thinkstats2.Mean([record.diagdate for record in self.records])
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Fits a least squares line of totalwgt_lb against agepreg, prints
    goodness-of-fit statistics and p-values, and saves three figures
    with roots 'linear3', 'linear5' and 'linear4'.

    live: DataFrame with agepreg and totalwgt_lb columns
    """
    ages = live.agepreg
    weights = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    r2 = thinkstats2.CoefDetermination(weights, res)

    # print correlation and goodness-of-fit statistics
    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(res))

    # plot the 90% and 50% confidence intervals and save them as 'linear3'
    inters, slopes = SamplingDistributions(live, iters=1001)
    PlotConfidenceIntervals(ages, inters, slopes, percent=90,
                            alpha=0.3, label='90% CI')
    thinkplot.Text(42, 7.53, '90%')
    PlotConfidenceIntervals(ages, inters, slopes, percent=50,
                            alpha=0.5, label='50% CI')
    thinkplot.Text(42, 7.59, '50%')

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # plot the data with confidence intervals drawn with and without the
    # residuals, and save them as 'linear5'
    # NOTE(review): the effect of res=res is defined by
    # PlotConfidenceIntervals, which is not visible here -- confirm
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)

    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # plot the sampling distribution of slope under null hypothesis
    # and alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    # presumably sampling_cdf[0] is the CDF evaluated at a slope of 0 --
    # verify against thinkstats2.Cdf
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((ages, weights))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    # compare each fitted parameter with the mean of its sampled values
    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    thinkplot.PrePlot(2)
    # vertical line at a slope of 0
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')

    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')