def RunSimpleRegression(live):
    """Fits birth weight against mother's age two ways and checks they agree.

    The fit is computed once with the thinkstats2 helpers (on rows where
    both columns are present) and once with an OLS formula model built
    from `smf`; the intercept, slope and R^2 of the two fits are asserted
    to match within a small absolute tolerance.

    live: DataFrame of live births
    """
    def AlmostEquals(x, y, tol=1e-6):
        """Returns True if x and y differ by less than tol."""
        return abs(x - y) < tol

    # thinkstats2 path: drop rows with a missing age or weight first
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    xs, ys = complete.agepreg, complete.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    r2 = thinkstats2.CoefDetermination(
        ys, thinkstats2.Residuals(xs, ys, inter, slope))

    # formula path: the model is handed the unfiltered frame
    results = smf.ols('totalwgt_lb ~ agepreg', data=live).fit()
    SummarizeResults(results)

    # the fitted parameters must agree with the hand-computed ones
    for param, expected in [('Intercept', inter), ('agepreg', slope)]:
        assert AlmostEquals(results.params[param], expected)
    assert AlmostEquals(results.rsquared, r2)
# Example no. 2
# 0
def PlotResiduals(live):
    """Plots the 25th, 50th and 75th percentiles of the residuals by age bin.

    Fits a least squares line of birth weight on mother's age, stores the
    residuals in a new 'residual' column of `live` (the frame is modified
    in place), and saves the plot with root 'linear2'.

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    live['residual'] = thinkstats2.Residuals(xs, ys, inter, slope)

    # assign every row to a 3-year age bin
    edges = np.arange(10, 48, 3)
    bin_index = np.digitize(live.agepreg, edges)

    # one pass over the groups: mean age and residual CDF per bin
    mean_ages = []
    cdfs = []
    for _, group in live.groupby(bin_index):
        mean_ages.append(group.agepreg.mean())
        cdfs.append(thinkstats2.Cdf(group.residual))

    # discard the first and last groups (presumably the sparse
    # out-of-range bins -- confirm against the data)
    mean_ages = mean_ages[1:-1]
    cdfs = cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in (75, 50, 25):
        quantiles = [cdf.Percentile(percent) for cdf in cdfs]
        thinkplot.Plot(mean_ages, quantiles, label='%dth' % percent)

    thinkplot.Save(root='linear2',
                   xlabel='age (years)',
                   ylabel='residual (lbs)',
                   xlim=[10, 45])
# Example no. 3
# 0
def ComputeCorrelations(heights, weights):
    """Prints correlations and a least squares fit of log weight on height.

    The Pearson and Spearman correlations are asserted against hard-coded
    expected values, so the function only passes for the dataset those
    values were computed from.  It also asserts that the square root of
    R^2 for the log-weight fit matches the log-weight Pearson correlation.

    heights: sequence
    weights: sequence
    """
    # Pearson correlation on the raw weights
    pearson = thinkstats2.Corr(heights, weights)
    assert almostEquals(pearson, 0.508736478973)
    print('Pearson correlation (weights):', pearson)

    # Pearson correlation on log-transformed weights
    logs = np.log(weights)
    log_pearson = thinkstats2.Corr(heights, logs)
    assert almostEquals(log_pearson, 0.531728260598)
    print('Pearson correlation (log weights):', log_pearson)

    # rank correlation; reported before it is checked
    spearman = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', spearman)
    assert almostEquals(spearman, 0.541535836332)

    # linear fit of log weight against height
    inter, slope = thinkstats2.LeastSquares(heights, logs)
    print('Least squares inter, slope (log weights):', inter, slope)

    residuals = thinkstats2.Residuals(heights, logs, inter, slope)
    r_squared = thinkstats2.CoefDetermination(logs, residuals)
    r_value = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', r_value)

    assert almostEquals(r_value, log_pearson)
# Example no. 4
# 0
def main(name, data_dir='.'):
    """Plots the sampling distribution of the slope around a flat line.

    Reads (xs, ys) with ReadData, models ys as a constant equal to its
    mean (slope 0), and passes the fitted values and residuals of that
    model to SamplingDistributions.  The CDF of the resulting slopes is
    saved with root 'regress1'.

    The original body continued after a bare `return` with an unreachable
    least-squares variant written with Python 2 print statements; that
    dead code never ran and made the function a SyntaxError under
    Python 3, so it has been removed.  Reachable behavior is unchanged.

    name: not used by this function
    data_dir: directory passed to ReadData
    """
    # fixed seed so repeated runs draw the same samples
    random.seed(17)

    xs, ys = ReadData(data_dir)

    # flat model: constant line at the mean of ys
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    # only the slope distribution is plotted; the intercept CDF is unused
    _, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)

    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
# Example no. 5
# 0
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept
    and slope, and R^2, followed by a blank line.

    ages: sequence of x values
    weights: sequence of y values

    returns: tuple of (inter, slope, R2)
    """
    # compute the correlation between age and weight
    print('Pearson correlation', thinkstats2.Corr(ages, weights))
    print('Spearman correlation', thinkstats2.SpearmanCorr(ages, weights))

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope):', inter, slope)

    # coefficient of determination from the residuals of the fit
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2', R2)
    print()
    return inter, slope, R2
# Example no. 6
# 0
def main():
    """Generates correlated (x, y) data, fits a line, and shows the result.

    Draws 1000 pairs from SatIqData with rho=0.8, prints summary
    statistics, the least squares intercept and slope, and R^2, then
    shows a scatter plot of the data with the fitted line.
    """
    # fixed seed so repeated runs draw the same samples
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)
    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter', inter)
    print('slope', slope)

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2', R2)

    # fitted line first, then the data points
    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
# Example no. 7
# 0
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted line and sampling dists.

    Prints summary statistics of the least squares fit of birth weight on
    mother's age, then saves three figures: 'linear3' (90% and 50%
    confidence bands), 'linear5' (scatter plot with two bands) and
    'linear4' (CDF of slopes from SlopeTest next to the CDF of the
    sampled slopes).

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    r2 = thinkstats2.CoefDetermination(ys, res)

    print('rho', thinkstats2.Corr(xs, ys))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(ys))
    print('Std(res)', thinkstats2.Std(res))

    # figure 'linear3': nested confidence bands, each with a text label
    inters, slopes = SamplingDistributions(live, iters=1001)
    bands = [
        # (percent, alpha, legend label, text y position, text)
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in bands:
        PlotConfidenceIntervals(xs,
                                inters,
                                slopes,
                                percent=percent,
                                alpha=alpha,
                                label=label)
        thinkplot.Text(42, text_y, text)

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # figure 'linear5': data with a band that is also given the residuals
    # and a band computed from the sampled lines alone
    thinkplot.PrePlot(2)
    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    PlotConfidenceIntervals(xs, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(xs, inters, slopes)
    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # compare the sampled slopes with the slope hypothesis test
    slope_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', slope_cdf[0])

    test = SlopeTest((xs, ys))
    print('p-value, slope test', test.PValue())

    # report each estimate next to the mean of its sampled values
    for label, estimate, samples in [('inter', inter, inters),
                                     ('slope', slope, slopes)]:
        print(label, estimate, thinkstats2.Mean(samples))
        Summarize(samples, estimate)

    # figure 'linear4': both CDFs with a vertical reference line at 0
    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    test.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(slope_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
#%%
# Estimate the intercept and slope of a least squares fit of logWeight
# on heights.  Both inputs are defined outside this section.
inter, slope = thinkstats2.LeastSquares(heights, logWeight)

print("intercept: {:.3f} \n slope: {:.3f}".format(inter, slope))

#%%
# Show a scatter plot of the data with the fitted line drawn in red.
# NOTE(review): the axis labels assume heights are in cm and logWeight is
# log10 of weight in kg -- confirm where those variables are built.
thinkplot.Scatter(heights, logWeight, alpha=0.01, s=5)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)')

#%%
# Get the residuals of the fit and store them as a new column of `data`.
# NOTE(review): assumes res lines up row-for-row with `data` -- confirm
# that heights/logWeight were taken from the same rows.
res = thinkstats2.Residuals(heights, logWeight, inter, slope)
data['residual'] = res

#%%
# Set up bins, indices, and groups to compute the mean height and the
# residual CDF for each 5-unit height bin.
bins = np.arange(130, 210, 5)
indices = np.digitize(data.htm3, bins)
groups = data.groupby(indices)

# [1:-1] discards the first and last groups (presumably the out-of-range
# bins -- confirm against the data)
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# plot the percentiles
for p in [75, 50, 25]:
    ys = [cdf.Percentile(p) for cdf in cdfs]
    label = str(p) + 'th'