def MakeFigures():
    """Generate the BRFSS height/weight scatterplot figures.

    Produces scatter plots of a 5000-row sample (raw, jittered, and
    jittered-with-transparency) and a hexbin of the full dataset.
    """
    thinkstats2.RandomSeed(17)
    df = brfss.ReadBrfss(nrows=None)
    sample = SampleRows(df, 5000, replace=False)

    # Raw sample; spot-check values that are fixed by the seed above.
    ht, wt = GetHeightWeight(sample)
    assert ht.values[100] == 175
    assert wt.values[100] == 86.36
    ScatterPlot('brfss_scatter1', ht, wt)

    # Jittered sample, plotted opaque and then with alpha blending.
    ht, wt = GetHeightWeight(sample, hjitter=1.5, wjitter=1.1)
    assert int(ht.values[100]) == 173
    assert int(wt.values[100]) == 85
    ScatterPlot('brfss_scatter2', ht, wt)
    ScatterPlot('brfss_scatter3', ht, wt, alpha=0.1)

    # Hexbin over every record rather than the 5000-row sample.
    ht, wt = GetHeightWeight(df, hjitter=1.3, wjitter=1.1)
    assert int(ht.values[100]) == 171
    assert int(wt.values[100]) == 55
    HexBin('brfss_scatter4', ht, wt)
def main(script):
    """Run the BRFSS height/weight analysis end to end.

    script: name of the script (sys.argv[0])
    """
    thinkstats2.RandomSeed(17)
    df = brfss.ReadBrfss(nrows=None)
    # Keep only rows where both height (htm3) and weight (wtkg2) are present.
    df = df.dropna(subset=['htm3', 'wtkg2'])

    Correlations(df)
    # BUG FIX: the original `return MakeFigures(df)` exited early, making the
    # BinnedPercentiles(df) call below unreachable. Run all three steps.
    MakeFigures(df)
    BinnedPercentiles(df)
def main(script):
    """Seed the RNGs, load BRFSS data, and produce all figures.

    script: name of the script (sys.argv[0])
    """
    # Seed both the stdlib and NumPy generators for reproducibility.
    random.seed(100)
    np.random.seed(100)

    # Load and keep only rows with both height and weight recorded.
    frame = brfss.ReadBrfss(nrows=None).dropna(subset=["htm3", "wtkg2"])

    Correlations(frame)
    MakeFigures(frame)
    BinnedPercentiles(frame)
def ReadHeights(nrows=None):
    """Read the BRFSS dataset and collect heights grouped by sex.

    nrows: number of rows to read, or None for all

    returns: dict mapping sex code to an array of heights (htm3)
    """
    resp = brfss.ReadBrfss(nrows=nrows).dropna(subset=['sex', 'htm3'])
    # One entry per sex code, in groupby (sorted-key) order.
    return {sex: grp.htm3.values for sex, grp in resp.groupby('sex')}
def main(name, nrows=None):
    """Test and compute correlations on BRFSS height/weight data.

    name: script name (sys.argv[0])
    nrows: string number of rows to read, or None for the full dataset
    """
    thinkstats2.RandomSeed(17)
    if nrows is not None:
        nrows = int(nrows)

    df = brfss.ReadBrfss(nrows=nrows)
    columns = df[['htm3', 'wtkg2']].dropna()
    heights, weights = columns.htm3.values, columns.wtkg2.values

    TestCorrelation(heights, weights)

    # FIX: idiomatic identity check (`is None`) instead of `== None` (PEP 8).
    # Only run the full computation when the whole dataset was read.
    if nrows is None:
        ComputeCorrelations(heights, weights)
def ComputeSkewnesses():
    """Plots KDE of birthweight and adult weight.

    Saves two figures: density_totalwgt_kde (NSFG birth weight, lbs) and
    density_wtkg2_kde (BRFSS adult weight, kg), each with mean and median
    marked by vertical lines.
    """
    def VertLine(x, y):
        # Light-gray vertical marker from the x-axis up to height y.
        thinkplot.Plot([x, x], [0, y], color='0.6', linewidth=1)

    live, firsts, others = first.MakeFrames()
    data = live.totalwgt_lb.dropna()
    print('Birth weight')
    mean, median = Summarize(data)

    # Mark mean/median; the small x-offsets keep the labels clear of the lines.
    y = 0.35
    VertLine(mean, y)
    thinkplot.Text(mean - 0.15, 0.1 * y, 'mean', horizontalalignment='right')
    VertLine(median, y)
    thinkplot.Text(median + 0.1, 0.1 * y, 'median', horizontalalignment='left')

    pdf = thinkstats2.EstimatedPdf(data)
    thinkplot.Pdf(pdf, label='birth weight')
    thinkplot.Save(root='density_totalwgt_kde', xlabel='lbs', ylabel='PDF')

    df = brfss.ReadBrfss(nrows=None)
    data = df.wtkg2.dropna()
    print('Adult weight')
    mean, median = Summarize(data)

    # Hand-tuned line height for the adult-weight density scale.
    y = 0.02499
    VertLine(mean, y)
    thinkplot.Text(mean + 1, 0.1 * y, 'mean', horizontalalignment='left')
    VertLine(median, y)
    thinkplot.Text(median - 1.5, 0.1 * y, 'median', horizontalalignment='right')

    pdf = thinkstats2.EstimatedPdf(data)
    thinkplot.Pdf(pdf, label='adult weight')
    # xlim clips a long right tail so the bulk of the distribution is visible.
    thinkplot.Save(root='density_wtkg2_kde', xlabel='kg', ylabel='PDF',
                   xlim=[0, 200])
def ComputeSkewnesses():
    """Plot KDEs of birth weight (NSFG) and adult weight (BRFSS).

    Saves density_totalwgt_kde and density_wtkg2_kde, each with the mean
    and median marked by vertical lines.
    """
    def VertLine(x, y):
        # Light-gray vertical marker from the x-axis up to height y.
        myplots.Plot([x, x], [0, y], color="0.6", linewidth=1)

    live, firsts, others = first.MakeFrames()
    data = live.totalwgt_lb.dropna()
    print("Birth weight")
    mean, median = Summarize(data)

    # Mark mean/median; small x-offsets keep labels clear of the lines.
    y = 0.35
    VertLine(mean, y)
    myplots.Text(mean - 0.15, 0.1 * y, "mean", horizontalalignment="right")
    VertLine(median, y)
    myplots.Text(median + 0.1, 0.1 * y, "median", horizontalalignment="left")

    pdf = mystats.EstimatedPdf(data)
    myplots.Pdf(pdf, label="birth weight")
    myplots.Save(root="density_totalwgt_kde", xlabel="lbs", ylabel="PDF")

    df = brfss.ReadBrfss(nrows=None)
    data = df.wtkg2.dropna()
    print("Adult weight")
    mean, median = Summarize(data)

    # Hand-tuned line height for the adult-weight density scale.
    y = 0.02499
    VertLine(mean, y)
    myplots.Text(mean + 1, 0.1 * y, "mean", horizontalalignment="left")
    VertLine(median, y)
    myplots.Text(median - 1.5, 0.1 * y, "median", horizontalalignment="right")

    pdf = mystats.EstimatedPdf(data)
    myplots.Pdf(pdf, label="adult weight")
    # xlim clips a long right tail so the bulk of the distribution is visible.
    myplots.Save(root="density_wtkg2_kde", xlabel="kg", ylabel="PDF",
                 xlim=[0, 200])
# Birth-weight density plot plus moment-based skewness summary.
# FIX: the Config keywords were misspelled 'xlable'/'ylable'; thinkplot.Config
# silently ignores unknown options, so the axis labels never appeared.
thinkplot.Config(xlabel='birth weight (pounds)', ylabel='PDF')

#%%
mean = RawMoment(data, 1)
print("mean: {:.2f} pounds".format(mean))
# NOTE(review): 'Medinan' looks like a typo for 'Median', but it must match
# the helper's actual (misspelled) name defined elsewhere in this notebook —
# confirm before renaming.
medi = Medinan(data)
print("median: {:.2f} pounds".format(medi))
skewness = Skewness(data)
print("skewness: {:.2f} ".format(skewness))
pearson = PearsonMedianSkewness(data)
print("pearson's median skewness: {:.2f}".format(pearson))

#%%
# BRFSS: repeat the analysis for adult weights.
import brfss

df = brfss.ReadBrfss(nrows=None)
data = df.wtkg2.dropna()
pdf = thinkstats2.EstimatedPdf(data)
thinkplot.Pdf(pdf, label="adult weight")
thinkplot.Config(xlabel='weight (kg)', ylabel='PDF')

#%%
# Same density, with the long right tail clipped at 200 kg.
pdf = thinkstats2.EstimatedPdf(data)
thinkplot.Pdf(pdf, label="adult weight")
thinkplot.Config(xlabel='weight (kg)', ylabel='PDF', xlim=[0, 200])

#%%
mean = RawMoment(data, 1)
print("mean: {:.1f} kg".format(mean))
medi = Medinan(data)
print("median: {:.1f} kg".format(medi))
thinkplot.Plot(xs, ys, label='all live') xs, ys = thinkstats2.NormalProbability(term_weights) thinkplot.Plot(xs, ys, label='full term') thinkplot.Config(title='Normal probability plot', xlabel='Standard deviations from mean', ylabel='Birth weight (lbs)') #%% [markdown] # ## Lognormal model # # As an example of a lognormal disrtribution, we'll look at adult weights from the BRFSS. #%% import brfss df = brfss.ReadBrfss() weights = df.wtkg2.dropna() #%% [markdown] # The following function estimates the parameters of a normal distribution and plots the data and a normal model. #%% def MakeNormalModel(weights): """Plots a CDF with a Normal model. weights: sequence """ cdf = thinkstats2.Cdf(weights, label='weights') mean, var = thinkstats2.TrimmedMeanVar(weights)
def main(script):
    """Seed the RNGs and load the BRFSS height/weight data.

    script: name of the script (sys.argv[0])
    """
    # Seed both stdlib and NumPy generators so results are reproducible.
    random.seed(100)
    np.random.seed(100)
    df = brfss.ReadBrfss(nrows=None)
    # Keep only rows with both height (htm3) and weight (wtkg2) present.
    df = df.dropna(subset=["htm3", "wtkg2"])
class DiffMeansResample(h0.DiffMeansPermute): def RunModel(self): """ Goal: Use resampling to simulate test data Output: Simulated data """ group1 = np.random.choice(self.pool, self.n, replace=True) group2 = np.random.choice(self.pool, self.m, replace=True) data = group1, group2 return data preg = nsfg.ReadFemPreg() live = preg[preg["outcome"] == 1] resp = nsfg.ReadFemResp() bs = brfss.ReadBrfss() income = hinc.ReadData() log_intp_income = hinc2.InterpolateSample(income, log_upper=6.0) # Q1. Think Stats Chapter 2 Exercise 4 (effect size of Cohen's d) first_wt = live.loc[preg["birthord"] == 1, "totalwgt_lb"] other_wt = live.loc[preg["birthord"] != 1, "totalwgt_lb"] ts.CohenEffectSize(first_wt, other_wt) # Q2. Think Stats Chapter 3 Exercise 1 (actual vs. biased) d = np.diff(np.unique(resp["numkdhh"])).min() left_of_first_bin = resp["numkdhh"].min() - float(d) / 2 right_of_last_bin = resp["numkdhh"].max() + float(d) / 2 plt.clf() plt.hist(resp["numkdhh"], bins=np.arange(left_of_first_bin, right_of_last_bin + d, d),