Beispiel #1
0
def MakeNormalPlot(weights):
    """Builds the model line used in a normal probability plot.

    The intercept is the mean of weights and the slope is its standard
    deviation; the endpoints [-4, 4] are handed to thinkstats2.FitLine.

    weights: object exposing mean() and std() methods

    returns: tuple (fxs, fys) unpacked from thinkstats2.FitLine
    """
    center = weights.mean()
    spread = weights.std()

    line_xs, line_ys = thinkstats2.FitLine([-4, 4], inter=center, slope=spread)
    return line_xs, line_ys
Beispiel #2
0
def PlotConfidenceIntervals(xs,
                            inters,
                            slopes,
                            res=None,
                            percent=90,
                            **options):
    """Fills the band between two percentile curves of a family of fitted lines.

    One line is generated per (intercept, slope) pair.  When residuals are
    given, a random permutation of them is added to each line's fitted values
    before the percentiles are taken.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    res: residuals, or None to plot the fitted lines alone
    percent: width of the central percentile range to show (default 90)
    options: keyword arguments forwarded to thinkplot.FillBetween
    """
    fitted_rows = []
    for intercept, gradient in zip(inters, slopes):
        fxs, line_ys = thinkstats2.FitLine(xs, intercept, gradient)
        if res is not None:
            # In-place add, matching the original accumulation into fys.
            line_ys += np.random.permutation(res)
        fitted_rows.append(line_ys)

    # Split the excluded mass evenly between the two tails.
    tail = (100 - percent) / 2
    bounds = tail, 100 - tail
    low, high = thinkstats2.PercentileRows(fitted_rows, bounds)

    # fxs is the x grid from the last fitted line.
    thinkplot.FillBetween(fxs, low, high, **options)
Beispiel #3
0
def PlotArrivalDepartureDelayFit(flights):
    """Saves a scatter plot of delays with a fitted line, then summarizes an OLS fit.

    A 1000-row sample is drawn from flights.  Departure delay is plotted
    against arrival delay with the least-squares line overlaid, the figure is
    saved, and the same relationship is fitted with smf.ols and passed to
    regression.SummarizeResults.

    flights: DataFrame with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    subset = thinkstats2.SampleRows(flights, 1000)
    arrivals = subset.ARRIVAL_DELAY
    departures = subset.DEPARTURE_DELAY

    intercept, gradient = thinkstats2.LeastSquares(arrivals, departures)
    line_xs, line_ys = thinkstats2.FitLine(arrivals, intercept, gradient)

    thinkplot.Scatter(arrivals, departures, color='gray', alpha=0.1)
    # The line is drawn twice: a wider white stroke first, then a narrower
    # blue stroke on top of it.
    for stroke, width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys, color=stroke, linewidth=width)

    save_options = dict(root='ArrivalDepartureDelayFit_linear1',
                        xlabel='arrival delay (min)',
                        ylabel='departure delay (min)',
                        legend=False)
    thinkplot.Save(**save_options)

    ols_model = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=subset)
    regression.SummarizeResults(ols_model.fit())
def PlotConfidenceIntervals(xs, inters, slopes, percent=90, **options):
    """Fills the band between two percentile curves of a family of fitted lines.

    xs: sequence passed to thinkstats2.FitLine for every line
    inters: estimated intercepts
    slopes: estimated slopes
    percent: width of the central percentile range to show (default 90)
    options: keyword arguments forwarded to thinkplot.FillBetween
    """
    fitted_rows = []
    for intercept, gradient in zip(inters, slopes):
        fxs, line_ys = thinkstats2.FitLine(xs, intercept, gradient)
        fitted_rows.append(line_ys)

    # Split the excluded mass evenly between the two tails.
    tail = (100 - percent) / 2
    bounds = tail, 100 - tail
    low, high = thinkstats2.PercentileRows(fitted_rows, bounds)

    # fxs is the x grid from the last fitted line.
    thinkplot.FillBetween(fxs, low, high, **options)
Beispiel #5
0
def main(name, data_dir='.'):
    """Plots and saves the sampling distribution of the estimated slope.

    Seeds the random module, reads xs and ys through ReadData(data_dir),
    builds a zero-slope line through the mean of ys, and hands that line and
    its residuals to SamplingDistributions.  The resulting slope CDF is saved
    under the root 'regress1'.

    NOTE(review): the bare `return` after the first Save call makes the
    least-squares section below it unreachable -- confirm whether the early
    exit is intentional or leftover debugging.

    name: not referenced in the body
    data_dir: passed to ReadData
    """
    random.seed(17)

    xs, ys = ReadData(data_dir)
    # Baseline model: intercept at the mean of ys, slope fixed at zero.
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)

    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')

    # Everything after this return is currently dead code.
    return

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print 'inter', inter
    print 'slope', slope

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    # Index of the middle element of the fitted line (relies on Python 2
    # integer division to yield an int).
    i = len(fxs) / 2
    print 'median weight, age', fxs[i], fys[i]

    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print 'R2', R2
    print 'R', math.sqrt(R2)

    #thinkplot.Plot(fxs, fys, color='gray', alpha=0.5)
    #thinkplot.Scatter(xs, ys, alpha=0.05)
    #thinkplot.Show()

    # Same plot as above, written to the same 'regress1' root.
    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
Beispiel #6
0
def PlotNormalProbability(sample, title="", ylabel=""):
    """Draws a normal probability plot of sample against a model line.

    The model line uses the trimmed mean (p=0.01) as intercept and the square
    root of the trimmed variance as slope, over the endpoints [-5, 5].

    sample: sequence of values
    title: string title for the figure
    ylabel: string label for the y axis
    """
    mu, var = thinkstats2.TrimmedMeanVar(sample, p=0.01)
    sigma = np.sqrt(var)

    model_xs, model_ys = thinkstats2.FitLine([-5, 5], inter=mu, slope=sigma)
    model_label = r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma)
    thinkplot.plot(model_xs, model_ys, color='gray', label=model_label)

    sample_xs, sample_ys = thinkstats2.NormalProbability(sample)
    thinkplot.Plot(sample_xs, sample_ys, label="actual")

    thinkplot.Config(title=title, xlabel="z", ylabel=ylabel)
Beispiel #7
0
def MakeNormalPlot(weights):
    """Draws a normal probability plot of weights over a model line.

    The model line is built from the trimmed mean and variance (p=0.01) of
    weights, over the endpoints [-5, 5].

    weights: sequence
    """
    trimmed_mean, trimmed_var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    sigma = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-5, 5], trimmed_mean, sigma)
    thinkplot.Plot(model_xs, model_ys, color='0.8', label='model')

    sample_xs, sample_ys = thinkstats2.NormalProbability(weights)
    thinkplot.Plot(sample_xs, sample_ys, label='weights')
def MakeNormalPlot(x):
    """Shows a normal probability plot for column x of the module-level df.

    The model line is built from the trimmed mean and variance (p=0.01) of
    df[x], over the endpoints [-4, 4].

    x: key used to index df; also interpolated into the plot title
    """
    trimmed_mean, trimmed_var = thinkstats2.TrimmedMeanVar(df[x], p=0.01)
    sigma = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], trimmed_mean, sigma)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    sample_xs, sample_ys = thinkstats2.NormalProbability(df[x])
    thinkplot.Plot(sample_xs, sample_ys, label='Number of Crimes')

    show_options = dict(title='Normal Prob Plot: {}'.format(x),
                        xlabel='Standard deviations from mean',
                        ylabel='Number of Crimes')
    thinkplot.Show(**show_options)
Beispiel #9
0
def PlotFit(live):
    """Saves a scatter plot of birth weight against age with a fitted line.

    live: DataFrame with agepreg and totalwgt_lb columns
    """
    age_col = live.agepreg
    weight_col = live.totalwgt_lb

    intercept, gradient = thinkstats2.LeastSquares(age_col, weight_col)
    line_xs, line_ys = thinkstats2.FitLine(age_col, intercept, gradient)

    thinkplot.Scatter(age_col, weight_col, color='gray', alpha=0.1)
    # The line is drawn twice: a wider white stroke first, then a narrower
    # blue stroke on top of it.
    for stroke, width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys, color=stroke, linewidth=width)

    save_options = dict(root='linear1',
                        xlabel='age (years)',
                        ylabel='birth weight (lbs)',
                        axis=[10, 45, 0, 15],
                        legend=False)
    thinkplot.Save(**save_options)
Beispiel #10
0
def MakeNormalPlot(arrivalDelays):
    """Saves a normal probability plot of the arrival delays.

    The model line uses the mean of arrivalDelays as intercept and its
    standard deviation as slope, over the endpoints [-4, 4].

    arrivalDelays: object exposing mean() and std() methods
    """
    center = arrivalDelays.mean()
    spread = arrivalDelays.std()

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], center, spread)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    sample_xs, sample_ys = thinkstats2.NormalProbability(arrivalDelays)
    thinkplot.Plot(sample_xs, sample_ys, label='arrival delays (min)')

    save_options = dict(root='NormalModel_arrivaldelay_normalplot',
                        title='Normal probability plot',
                        xlabel='Standard deviations from mean',
                        ylabel='Arrival Delays (min)')
    thinkplot.Save(**save_options)
def MakeNormalPlot(weights, term_weights):
    """Saves a normal probability plot of two weight samples over a model line.

    The model line is built from the trimmed mean and variance (p=0.01) of
    weights only, over the endpoints [-4, 4].

    weights: sequence plotted with the label 'all live'
    term_weights: sequence plotted with the label 'full term'
    """
    trimmed_mean, trimmed_var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    sigma = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], trimmed_mean, sigma)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    labelled_samples = ((weights, 'all live'), (term_weights, 'full term'))
    for series, legend_text in labelled_samples:
        sample_xs, sample_ys = thinkstats2.NormalProbability(series)
        thinkplot.Plot(sample_xs, sample_ys, label=legend_text)

    save_options = dict(root='analytic_birthwgt_normal',
                        title='Normal probability plot',
                        xlabel='Standard deviations from mean',
                        ylabel='Birth weight (lbs)')
    thinkplot.Save(**save_options)
Beispiel #12
0
def PlotConfidenceIntervals(xs,
                            inters,
                            slopes,
                            res=None,
                            percent=90,
                            **options):
    """Fills the band between two percentile curves of a family of fitted lines.

    One line is generated per (intercept, slope) pair and stored as a row of
    an array.  When residuals are given, a random permutation of them is
    added to each row.  The array is then sorted column by column and the
    rows at the lower and upper percentiles bound the filled region.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    res: residuals, or None to plot the fitted lines alone
    percent: what percentile range to show
    options: keyword arguments forwarded to thinkplot.FillBetween
    """
    num_lines = len(slopes)
    array = np.zeros((num_lines, len(xs)))

    for i, (inter, slope) in enumerate(zip(inters, slopes)):
        fxs, fys = thinkstats2.FitLine(xs, inter, slope)
        if res is not None:
            fys += np.random.permutation(res)
        array[i, ] = fys

    # Sort each column independently so that row k holds the k-th smallest
    # fitted value at every x position.
    array = np.sort(array, axis=0)

    def Percentile(p):
        """Selects the line from array that corresponds to percentile p.

        p: float 0--100

        returns: NumPy array (one row)
        """
        index = int(num_lines * p / 100)
        # Clamp into the valid row range: p == 100 (reached with
        # percent=100) would otherwise index one past the last row and
        # raise IndexError.
        index = max(0, min(index, num_lines - 1))
        return array[index, ]

    # Split the excluded mass evenly between the two tails.
    p = (100 - percent) / 2
    low = Percentile(p)
    high = Percentile(100 - p)
    thinkplot.FillBetween(fxs, low, high, **options)
Beispiel #13
0
def main():
    """Fits a least-squares line to data from SatIqData and shows it.

    Seeds the random module, generates 1000 (x, y) pairs with rho = 0.8,
    prints summary statistics, the fitted intercept and slope, and the
    coefficient of determination, then shows the fitted line over a scatter
    plot of the data.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)
    print 'mean, var of x', thinkstats2.MeanVar(xs)
    print 'mean, var of y', thinkstats2.MeanVar(ys)
    print 'Pearson corr', thinkstats2.Corr(xs, ys)

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print 'inter', inter
    print 'slope', slope

    # Fitted line and residuals for the estimated parameters.
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print 'R2', R2

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Beispiel #14
0
    label = '$\mu=%d$, $\sigma=%d$' % (mu, sigma)
    thinkplot.Plot(xs, ys, label=label)

# Set the title and axis labels through thinkplot.Config.
thinkplot.Config(title='Normal probability plot',
                 xlabel='standard normal sample',
                 ylabel='sample values')

#%% [markdown]
# Here's the normal probability plot for birth weights, showing that the lightest babies are lighter than we expect from the normal model, and the heaviest babies are heavier.

#%%
# Model parameters: trimmed mean and variance (p=0.01) of weights.
mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
std = np.sqrt(var)

# Model line with intercept `mean` and slope `std` over the endpoints [-4, 4].
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, mean, std)
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')

# Note: xs is rebound here from the endpoints to the NormalProbability output.
xs, ys = thinkstats2.NormalProbability(weights)
thinkplot.Plot(xs, ys, label='all live')

thinkplot.Config(title='Normal probability plot',
                 xlabel='Standard deviations from mean',
                 ylabel='Birth weight (lbs)')

#%% [markdown]
# If we suspect that the deviation in the left tail is due to preterm babies, we can check by selecting only full term births.

#%%
# Keep rows whose prglngth is at least 37, then drop missing totalwgt_lb values.
full_term = preg[preg.prglngth >= 37]
term_weights = full_term.totalwgt_lb.dropna()
Beispiel #15
0
# Sample size used for each simulated normal sample.
n = 1000
thinkplot.PrePlot(3)
# One curve per (mu, sigma) pair: (0, 1), (1, 1) and (5, 2).
for mu, sigma in zip([0, 1, 5], [1, 1, 2]):
    sample = np.random.normal(mu, sigma, n)
    xs, ys = thinkstats2.NormalProbability(sample)
    thinkplot.plot(xs, ys, label=r"$\mu$={} $\sigma$={}".format(mu, sigma))
thinkplot.Config(title="Normal probability plot",
                 xlabel="standard normal sample",
                 ylabel="sample value")

#%%
# Model parameters: trimmed mean and variance (p=0.01) of totalwgt_lb.
mu, var = thinkstats2.TrimmedMeanVar(totalwgt_lb, p=0.01)
# Non-missing weights of rows whose prglngth is at least 37.
maturity = live[live.prglngth >= 37].totalwgt_lb.dropna()
sigma = np.sqrt(var)
# Model line with intercept mu and slope sigma over the endpoints [-4, 4].
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mu, slope=sigma)
thinkplot.plot(fxs,
               fys,
               color='gray',
               label=r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma))
# Note: xs and ys are rebound for each sample plotted below.
xs, ys = thinkstats2.NormalProbability(totalwgt_lb)
thinkplot.Plot(xs, ys, label="all")
xs, ys = thinkstats2.NormalProbability(maturity)
thinkplot.Plot(xs, ys, label="maturity")
# Config is called without arguments, so no title or axis labels are set here.
thinkplot.Config()

#%% [markdown]
# ## Lognormal distribution

#%%
import brfss
Beispiel #16
0
# Bar width shared by both histograms.
width = 0.45
thinkplot.PrePlot(2)
# male_pmf is drawn left-aligned in blue, female_pmf right-aligned in red.
thinkplot.Hist(male_pmf, width=width, align='left', color='blue')
thinkplot.Hist(female_pmf, width=width, align='right', color='red')
thinkplot.Config(ylabel='Probability')

# Plot the CDF of df.Age.
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# Normal probability plot of df.Age against a model line whose intercept is
# the mean of df.Age and whose slope is its standard deviation.
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
# Note: xs is rebound here from the endpoints to the NormalProbability output.
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# Scatter plots and correlation.
# Year vs. age: df.Year is passed through thinkstats2.Jitter with 0.25 before plotting.
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the correlation is computed on the un-jittered Year, and its
# value is neither stored nor printed here.
thinkstats2.Corr(df.Year, df.Age)
# Drug vs. age.
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

#testing a difference in gender
# NOTE(review): the statements below only read the wtkg2 and htm3 columns of
# `data`; nothing here references gender -- confirm this heading is still apt.
weights = data.wtkg2
heights = data.htm3

# Base-10 logarithm of the weights.
logWeight = np.log10(weights)

#%%
# Estimate intercept and slope of logWeight as a function of heights.
inter, slope = thinkstats2.LeastSquares(heights, logWeight)

print("intercept: {:.3f} \n slope: {:.3f}".format(inter, slope))

#%%
# Scatter plot of the data with the fitted line drawn in red on top.
thinkplot.Scatter(heights, logWeight, alpha=0.01, s=5)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)')

#%%
# Compute the residuals of the fit and store them as a new 'residual' column.
res = thinkstats2.Residuals(heights, logWeight, inter, slope)
data['residual'] = res

#%%
# Set up bins, indices, and groups to calculate means and CDFs.
# Bin edges run from 130 up to (but not including) 210 in steps of 5.
bins = np.arange(130, 210, 5)
indices = np.digitize(data.htm3, bins)
groups = data.groupby(indices)

# Mean htm3 per group; the [1:-1] slice drops the first and last groups.
means = [group.htm3.mean() for i, group in groups][1:-1]