Esempio n. 1
0
def MakeHists(live):
    """Plot and save one histogram per variable for live births.

    Four figures are written, in this order: birthwgt_lb, birthwgt_oz,
    agepreg (values passed through np.floor first) and prglngth.

    live: DataFrame providing the four columns named above
    """
    def render(values, label, root, xlabel, **save_options):
        # Build the Hist, draw it, then write the figure out. Extra Save
        # options (such as axis) are forwarded only when the caller gives
        # them, so a figure without explicit limits passes no axis at all.
        myplots.Hist(mystats.Hist(values, label=label))
        myplots.Save(root=root,
                     xlabel=xlabel,
                     ylabel="frequency",
                     **save_options)

    render(live.birthwgt_lb, "birthwgt_lb", "first_wgt_lb_hist", "pounds",
           axis=[-1, 14, 0, 3200])

    render(live.birthwgt_oz, "birthwgt_oz", "first_wgt_oz_hist", "ounces",
           axis=[-1, 16, 0, 1200])

    # no axis limits are set for this figure
    render(np.floor(live.agepreg), "agepreg", "first_agepreg_hist", "years")

    render(live.prglngth, "prglngth", "first_prglngth_hist", "weeks",
           axis=[-1, 53, 0, 5000])
Esempio n. 2
0
def RunTests(data, iters=1000):
    """Run three permutation tests on data and print each result.

    The tests are, in order: DiffMeansPermute, DiffMeansOneSided and
    DiffStdPermute.  Only the first one also has its CDF plotted and
    saved (root "hypothesis1").

    data: passed unchanged to each test constructor
    iters: forwarded to each test's PValue call
    """
    def report(test, heading):
        # compute the p-value first, then print the heading and the result
        p_value = test.PValue(iters=iters)
        print(heading)
        PrintTest(p_value, test)
        return test

    # difference in means, two-sided
    two_sided = report(DiffMeansPermute(data), "\nmeans permute two-sided")

    two_sided.PlotCdf()
    myplots.Save(
        root="hypothesis1",
        title="Permutation test",
        xlabel="difference in means (weeks)",
        ylabel="CDF",
        legend=False,
    )

    # difference in means, one-sided
    report(DiffMeansOneSided(data), "\nmeans permute one-sided")

    # difference in std
    report(DiffStdPermute(data), "\nstd permute one-sided")
Esempio n. 3
0
def MakePdfExample(n=500):
    """Print one PDF value and save a plot of a normal PDF.

    The value printed is whatever PDF returns for the point one standard
    deviation above the mean.  The figure is saved with root "pdf_example".

    n: forwarded to plotPDF as its n argument
    """
    # mean and variance of women's heights in cm, from the BRFSS
    mu = 163
    variance = 52.8
    sigma = math.sqrt(variance)

    value_at_one_sigma = PDF(mu + sigma, mu, sigma)
    print(value_at_one_sigma)

    plotPDF(mu, sigma, n=n, label="normal")
    myplots.Save(root="pdf_example", xlabel="Height (cm)", ylabel="Density")
Esempio n. 4
0
def MakeFigures(df):
    """Save two two-panel figures, "scatter1" and "scatter2".

    scatter1: plain scatter plot next to a jittered scatter plot, both
    drawn from a 5000-row sample of df.
    scatter2: the same jittered sample drawn with alpha=0.1 next to a
    hexbin plot built from all of df (with jitter).

    df: DataFrame accepted by mystats.SampleRows and GetHeightWeight
    """
    jitter = dict(hjitter=1.3, wjitter=0.5)
    subset = mystats.SampleRows(df, 5000)

    # --- scatter1, panel 1: sample without jitter
    myplots.PrePlot(cols=2)
    raw_h, raw_w = GetHeightWeight(subset)
    ScatterPlot(raw_h, raw_w)

    # --- scatter1, panel 2: sample with jitter
    myplots.SubPlot(2)
    jit_h, jit_w = GetHeightWeight(subset, **jitter)
    ScatterPlot(jit_h, jit_w)

    myplots.Save(root="scatter1")

    # --- scatter2, panel 1: the jittered sample again, now transparent
    myplots.PrePlot(cols=2)
    ScatterPlot(jit_h, jit_w, alpha=0.1)

    # --- scatter2, panel 2: hexbin over the full DataFrame, not the sample
    myplots.SubPlot(2)
    all_h, all_w = GetHeightWeight(df, **jitter)
    HexBin(all_h, all_w)
    myplots.Save(root="scatter2")
Esempio n. 5
0
def ComputeSkewnesses():
    """Summarize and plot estimated PDFs of birth weight and adult weight.

    For each of the two data sets this prints a heading, calls Summarize,
    marks the returned mean and median with vertical lines and captions,
    then plots mystats.EstimatedPdf of the data and saves the figure
    ("density_totalwgt_kde" and "density_wtkg2_kde").
    """
    def Mark(x, top, text_x, caption, align):
        # vertical grey line from 0 to top at x, with a caption placed at
        # one tenth of the line's height
        myplots.Plot([x, x], [0, top], color="0.6", linewidth=1)
        myplots.Text(text_x, 0.1 * top, caption, horizontalalignment=align)

    # ---- birth weight: only the first frame from first.MakeFrames is used
    live, _, _ = first.MakeFrames()
    birth_weights = live.totalwgt_lb.dropna()
    print("Birth weight")
    mean, median = Summarize(birth_weights)

    line_top = 0.35
    Mark(mean, line_top, mean - 0.15, "mean", "right")
    Mark(median, line_top, median + 0.1, "median", "left")

    myplots.Pdf(mystats.EstimatedPdf(birth_weights), label="birth weight")
    myplots.Save(root="density_totalwgt_kde", xlabel="lbs", ylabel="PDF")

    # ---- adult weight: wtkg2 column of the BRFSS data
    respondents = brfss.ReadBrfss(nrows=None)
    adult_weights = respondents.wtkg2.dropna()
    print("Adult weight")
    mean, median = Summarize(adult_weights)

    line_top = 0.02499
    Mark(mean, line_top, mean + 1, "mean", "left")
    Mark(median, line_top, median - 1.5, "median", "right")

    myplots.Pdf(mystats.EstimatedPdf(adult_weights), label="adult weight")
    myplots.Save(root="density_wtkg2_kde",
                 xlabel="kg",
                 ylabel="PDF",
                 xlim=[0, 200])
Esempio n. 6
0
def BinnedPercentiles(df):
    """Bin rows by htm3 and plot percentiles of wtkg2 for each bin.

    Prints the difference cdf[200] - cdf[140] of the htm3 Cdf, then plots
    the 75th, 50th and 25th percentile of wtkg2 against the mean htm3 of
    each bin and saves the figure with root "scatter3".

    df: DataFrame with htm3 and wtkg2 columns
    """
    height_cdf = mystats.Cdf(df.htm3)
    print("Fraction between 140 and 200 cm", height_cdf[200] - height_cdf[140])

    # bin edges every 5 units from 135 up to, but not including, 210
    edges = np.arange(135, 210, 5)
    bin_ids = np.digitize(df.htm3, edges)

    # one (mean htm3, Cdf of wtkg2) pair per bin
    per_bin = [
        (rows.htm3.mean(), mystats.Cdf(rows.wtkg2))
        for _, rows in df.groupby(bin_ids)
    ]

    # drop the first and last bins
    interior = per_bin[1:-1]
    heights = [mean_height for mean_height, _ in interior]
    weight_cdfs = [weight_cdf for _, weight_cdf in interior]

    myplots.PrePlot(3)
    for percent in (75, 50, 25):
        weights = [weight_cdf.Percentile(percent) for weight_cdf in weight_cdfs]
        myplots.Plot(heights, weights, label="{:d}th".format(percent))

    myplots.Save(root="scatter3", xlabel="height (cm)", ylabel="weight (kg)")
Esempio n. 7
0
def PrintExtremes(live):
    """Save a histogram of prglngth and print its extreme values.

    The figure is saved with root "first_nsfg_hist_live".  Afterwards the
    (value, frequency) pairs returned by Smallest(10) and Largest(10) are
    printed, each list under its own heading.

    live: DataFrame of live births
    """
    length_hist = mystats.Hist(live.prglngth)
    myplots.Hist(length_hist, label="live births")

    myplots.Save(root="first_nsfg_hist_live",
                 title="Histogram",
                 xlabel="weeks",
                 ylabel="frequency")

    listings = (
        ("Shortest lengths:", length_hist.Smallest),
        ("Longest lengths:", length_hist.Largest),
    )
    for heading, select in listings:
        print(heading)
        # the selection is made only after its heading has been printed
        for weeks, freq in select(10):
            print(weeks, freq)
Esempio n. 8
0
def MakeComparison(firsts, others):
    """Plot prglngth histograms for first babies and others in one figure.

    The two histograms are drawn with opposite alignments ("right" for
    firsts, "left" for others) and the same bar width, then saved with
    root "first_nsfg_hist".

    firsts: DataFrame
    others: DataFrame
    """
    bar_width = 0.45
    aligned_hists = (
        (mystats.Hist(firsts.prglngth, label="first"), "right"),
        (mystats.Hist(others.prglngth, label="other"), "left"),
    )

    myplots.PrePlot(2)
    for hist, side in aligned_hists:
        myplots.Hist(hist, align=side, width=bar_width)

    myplots.Save(
        root="first_nsfg_hist",
        title="Histogram",
        xlabel="weeks",
        ylabel="frequency",
        axis=[27, 46, 0, 2700],
    )
Esempio n. 9
0
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulate the sampling distribution of the mean and plot its CDF.

    Draws m samples of size n with np.random.normal(mu, sigma, n), takes
    the mean of each, prints RMSE(means, mu) and the 5th/95th percentiles
    of the means, and saves the CDF of the means with root "estimation1".

    mu: mean passed to np.random.normal
    sigma: standard deviation passed to np.random.normal
    n: size of each sample
    m: number of samples drawn
    """
    def DrawMarker(x, height=1):
        # thick light-grey vertical line from 0 up to height at x
        myplots.Plot([x, x], [0, height], color="0.8", linewidth=3)

    sample_means = [
        np.mean(np.random.normal(mu, sigma, n)) for _ in range(m)
    ]

    print("standard error", RMSE(sample_means, mu))

    sampling_cdf = mystats.Cdf(sample_means)
    interval = sampling_cdf.Percentile(5), sampling_cdf.Percentile(95)
    print("confidence interval", interval)

    # mark both ends of the interval before the CDF itself is drawn
    for bound in interval:
        DrawMarker(bound)

    # plot the CDF
    myplots.Cdf(sampling_cdf)
    myplots.Save(root="estimation1",
                 xlabel="sample mean",
                 ylabel="CDF",
                 title="Sampling distribution")