def MakeHists(live):
    """Plot histograms of four variables from the live-birth records.

    Each histogram is built with mystats.Hist, drawn with myplots.Hist and
    then passed to myplots.Save under its own root name.

    live: DataFrame providing birthwgt_lb, birthwgt_oz, agepreg and prglngth
    """

    def _hist_figure(values, label, root, xlabel, **save_options):
        # One figure: build the Hist, draw it, save it. Every figure here
        # shares the same y-axis label.
        myplots.Hist(mystats.Hist(values, label=label))
        myplots.Save(root=root, xlabel=xlabel, ylabel="frequency", **save_options)

    _hist_figure(
        live.birthwgt_lb,
        "birthwgt_lb",
        "first_wgt_lb_hist",
        "pounds",
        axis=[-1, 14, 0, 3200],
    )
    _hist_figure(
        live.birthwgt_oz,
        "birthwgt_oz",
        "first_wgt_oz_hist",
        "ounces",
        axis=[-1, 16, 0, 1200],
    )
    # Ages are floored before counting; this figure sets no explicit axis.
    _hist_figure(
        np.floor(live.agepreg),
        "agepreg",
        "first_agepreg_hist",
        "years",
    )
    _hist_figure(
        live.prglngth,
        "prglngth",
        "first_prglngth_hist",
        "weeks",
        axis=[-1, 53, 0, 5000],
    )
def RunTests(data, iters=1000):
    """Run the difference-in-means and difference-in-std tests and report them.

    Three tests are run in order: DiffMeansPermute, DiffMeansOneSided and
    DiffStdPermute. For each, a heading is printed and the p-value is passed
    to PrintTest. The first test's CDF is also plotted and saved under the
    root "hypothesis1".

    data: passed unchanged to each test class
    iters: forwarded to each test's PValue call
    """

    def _run(test_class, heading):
        # Build the test, compute its p-value, then print the report.
        test = test_class(data)
        pvalue = test.PValue(iters=iters)
        print(heading)
        PrintTest(pvalue, test)
        return test

    # difference in means; this is the only test whose CDF is plotted
    two_sided = _run(DiffMeansPermute, "\nmeans permute two-sided")
    two_sided.PlotCdf()
    myplots.Save(
        root="hypothesis1",
        title="Permutation test",
        xlabel="difference in means (weeks)",
        ylabel="CDF",
        legend=False,
    )

    # difference in means, one-sided
    _run(DiffMeansOneSided, "\nmeans permute one-sided")

    # difference in std
    _run(DiffStdPermute, "\nstd permute one-sided")
def MakePdfExample(n=500):
    """Print PDF evaluated at mean + std and save the plotPDF figure.

    n: forwarded to plotPDF as its n argument
    """
    # mean and var of women's heights in cm, from the BRFSS
    mu, variance = 163, 52.8
    sigma = math.sqrt(variance)

    # value of PDF one standard deviation above the mean
    print(PDF(mu + sigma, mu, sigma))

    plotPDF(mu, sigma, n=n, label="normal")
    myplots.Save(
        root="pdf_example",
        xlabel="Height (cm)",
        ylabel="Density",
    )
def MakeFigures(df):
    """Make two two-panel figures of height against weight.

    "scatter1" holds a plain scatter plot and a jittered one, both drawn from
    a 5000-row sample of df. "scatter2" holds the jittered sample drawn with
    alpha=0.1 and a hexbin plot computed from all of df.

    df: DataFrame accepted by mystats.SampleRows and GetHeightWeight
    """
    subset = mystats.SampleRows(df, 5000)

    # figure 1, left panel: raw sample
    myplots.PrePlot(cols=2)
    hts, wts = GetHeightWeight(subset)
    ScatterPlot(hts, wts)

    # figure 1, right panel: same sample with jitter applied
    myplots.SubPlot(2)
    jittered_hts, jittered_wts = GetHeightWeight(subset, hjitter=1.3, wjitter=0.5)
    ScatterPlot(jittered_hts, jittered_wts)
    myplots.Save(root="scatter1")

    # figure 2, left panel: the jittered sample again, now with transparency
    myplots.PrePlot(cols=2)
    ScatterPlot(jittered_hts, jittered_wts, alpha=0.1)

    # figure 2, right panel: hexbin over the full DataFrame, not the sample
    myplots.SubPlot(2)
    all_hts, all_wts = GetHeightWeight(df, hjitter=1.3, wjitter=0.5)
    HexBin(all_hts, all_wts)
    myplots.Save(root="scatter2")
def ComputeSkewnesses():
    """Plot estimated PDFs of birth weight and adult weight with mean/median marks.

    Birth weights come from first.MakeFrames() (totalwgt_lb) and adult
    weights from brfss.ReadBrfss(nrows=None) (wtkg2). Missing values are
    dropped. For each dataset the mean and median returned by Summarize are
    marked with captioned vertical lines, the mystats.EstimatedPdf is drawn,
    and the figure is saved.
    """

    def _mark(x, top, text_x, caption, align):
        # Vertical line at x from 0 up to `top`, captioned at 10% of its height.
        myplots.Plot([x, x], [0, top], color="0.6", linewidth=1)
        myplots.Text(text_x, 0.1 * top, caption, horizontalalignment=align)

    # --- birth weight ---
    live, _, _ = first.MakeFrames()
    birth_weights = live.totalwgt_lb.dropna()
    print("Birth weight")
    mean, median = Summarize(birth_weights)

    line_top = 0.35
    _mark(mean, line_top, mean - 0.15, "mean", "right")
    _mark(median, line_top, median + 0.1, "median", "left")

    myplots.Pdf(mystats.EstimatedPdf(birth_weights), label="birth weight")
    myplots.Save(root="density_totalwgt_kde", xlabel="lbs", ylabel="PDF")

    # --- adult weight ---
    respondents = brfss.ReadBrfss(nrows=None)
    adult_weights = respondents.wtkg2.dropna()
    print("Adult weight")
    mean, median = Summarize(adult_weights)

    line_top = 0.02499
    _mark(mean, line_top, mean + 1, "mean", "left")
    _mark(median, line_top, median - 1.5, "median", "right")

    myplots.Pdf(mystats.EstimatedPdf(adult_weights), label="adult weight")
    myplots.Save(
        root="density_wtkg2_kde",
        xlabel="kg",
        ylabel="PDF",
        xlim=[0, 200],
    )
def BinnedPercentiles(df):
    """Plot the 75th, 50th and 25th weight percentiles against binned height.

    Rows are grouped by the height bin (edges 135, 140, ..., 205) that
    np.digitize assigns to htm3. The first and last groups are discarded,
    and for each remaining group the mean height and the weight percentiles
    are plotted. The fraction of the htm3 CDF between 140 and 200 is printed
    first.

    df: DataFrame with htm3 and wtkg2 columns
    """
    height_cdf = mystats.Cdf(df.htm3)
    print("Fraction between 140 and 200 cm", height_cdf[200] - height_cdf[140])

    edges = np.arange(135, 210, 5)
    bin_index = np.digitize(df.htm3, edges)
    per_bin = [rows for _, rows in df.groupby(bin_index)]

    # Compute over every group, then drop the first and last entries.
    mean_heights = [rows.htm3.mean() for rows in per_bin][1:-1]
    weight_cdfs = [mystats.Cdf(rows.wtkg2) for rows in per_bin][1:-1]

    myplots.PrePlot(3)
    for percent in [75, 50, 25]:
        percentile_weights = [wcdf.Percentile(percent) for wcdf in weight_cdfs]
        myplots.Plot(mean_heights, percentile_weights, label="%dth" % percent)

    myplots.Save(root="scatter3", xlabel="height (cm)", ylabel="weight (kg)")
def PrintExtremes(live):
    """Plot the histogram of pregnancy lengths and print its extremes.

    After saving the histogram, prints the (weeks, freq) pairs returned by
    hist.Smallest(10) and then those returned by hist.Largest(10).

    live: DataFrame of live births
    """
    length_hist = mystats.Hist(live.prglngth)
    myplots.Hist(length_hist, label="live births")
    myplots.Save(
        root="first_nsfg_hist_live",
        title="Histogram",
        xlabel="weeks",
        ylabel="frequency",
    )

    def _report(heading, select):
        # Heading first, then one "weeks freq" line per selected pair.
        print(heading)
        for weeks, freq in select(10):
            print(weeks, freq)

    _report("Shortest lengths:", length_hist.Smallest)
    _report("Longest lengths:", length_hist.Largest)
def MakeComparison(firsts, others):
    """Plot pregnancy-length histograms for first babies and others together.

    The two histograms are drawn on one figure with opposite bar alignment
    ("right" for firsts, "left" for others) and the same bar width.

    firsts: DataFrame
    others: DataFrame
    """
    bar_width = 0.45
    hist_firsts = mystats.Hist(firsts.prglngth, label="first")
    hist_others = mystats.Hist(others.prglngth, label="other")

    myplots.PrePlot(2)
    for hist, side in ((hist_firsts, "right"), (hist_others, "left")):
        myplots.Hist(hist, align=side, width=bar_width)

    myplots.Save(
        root="first_nsfg_hist",
        title="Histogram",
        xlabel="weeks",
        ylabel="frequency",
        axis=[27, 46, 0, 2700],
    )
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulate the sampling distribution of the mean and plot its CDF.

    Draws m samples of size n from np.random.normal(mu, sigma) and records
    each sample mean. Prints RMSE(means, mu) as the standard error and the
    5th and 95th percentiles of the means as the confidence interval, marks
    both percentiles with vertical lines, then plots and saves the CDF.

    mu: mean passed to np.random.normal
    sigma: standard deviation passed to np.random.normal
    n: size of each sample
    m: number of samples drawn
    """

    def _mark(x, top=1):
        # Vertical line at x from 0 up to `top`.
        myplots.Plot([x, x], [0, top], color="0.8", linewidth=3)

    sample_means = [np.mean(np.random.normal(mu, sigma, n)) for _ in range(m)]

    print("standard error", RMSE(sample_means, mu))

    sampling_cdf = mystats.Cdf(sample_means)
    bounds = sampling_cdf.Percentile(5), sampling_cdf.Percentile(95)
    print("confidence interval", bounds)
    for bound in bounds:
        _mark(bound)

    # plot the CDF
    myplots.Cdf(sampling_cdf)
    myplots.Save(
        root="estimation1",
        xlabel="sample mean",
        ylabel="CDF",
        title="Sampling distribution",
    )