def RunSimpleRegression(live):
    """Fits birth weight against mother's age two ways and cross-checks them.

    The line is fitted once with the thinkstats2 helpers (on the rows where
    both columns are present) and once with a statsmodels OLS model (on the
    unfiltered frame).  The statsmodels results are summarized, and the
    intercept, slope and R^2 of the two fits are asserted to agree to
    within 1e-6.

    live: DataFrame of live births
    """
    def Close(a, b, tol=1e-6):
        # absolute-difference comparison used by the checks below
        return abs(a - b) < tol

    # fit by hand on the rows that have both variables
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    xs = complete['agepreg']
    ys = complete['totalwgt_lb']

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    residuals = thinkstats2.Residuals(xs, ys, inter, slope)
    r2 = thinkstats2.CoefDetermination(ys, residuals)

    # the same model through statsmodels
    results = smf.ols('totalwgt_lb ~ agepreg', data=live).fit()
    SummarizeResults(results)

    # both routes must give the same estimates
    assert Close(results.params['Intercept'], inter)
    assert Close(results.params['agepreg'], slope)
    assert Close(results.rsquared, r2)
def PlotResiduals(live):
    """Plots the 75th, 50th and 25th percentiles of the residuals by age bin.

    Fits totalwgt_lb against agepreg, stores the residuals of that fit in
    a new 'residual' column of live (the frame is modified in place),
    groups the rows into age bins and plots the residual percentiles of
    each bin against the bin's mean age.  The figure is saved under the
    root name 'linear2'.

    live: DataFrame
    """
    inter, slope = thinkstats2.LeastSquares(live.agepreg, live.totalwgt_lb)
    live['residual'] = thinkstats2.Residuals(
        live.agepreg, live.totalwgt_lb, inter, slope)

    # bin edges every 3 years from 10 up to (but excluding) 48
    edges = np.arange(10, 48, 3)
    binned = live.groupby(np.digitize(live.agepreg, edges))

    mean_ages = []
    residual_cdfs = []
    for _, rows in binned:
        mean_ages.append(rows.agepreg.mean())
        residual_cdfs.append(thinkstats2.Cdf(rows.residual))

    # the first and last groups are left out of the plot
    mean_ages = mean_ages[1:-1]
    residual_cdfs = residual_cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        values = [cdf.Percentile(percent) for cdf in residual_cdfs]
        thinkplot.Plot(mean_ages, values, label='%dth' % percent)

    thinkplot.Save(root='linear2',
                   xlabel='age (years)',
                   ylabel='residual (lbs)',
                   xlim=[10, 45])
def ComputeCorrelations(heights, weights):
    """Prints correlations and a log-linear fit, checking known values.

    Reports Pearson's correlation of heights with weights and with the
    natural log of weights, Spearman's correlation of heights with
    weights, and the least squares fit of log(weights) on heights.  Each
    correlation is asserted against a hard-coded expected value, and
    sqrt(R^2) of the fit is asserted to match Pearson's correlation for
    the log weights.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    logs = np.log(weights)
    rho_log = thinkstats2.Corr(heights, logs)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    inter, slope = thinkstats2.LeastSquares(heights, logs)
    print('Least squares inter, slope (log weights):', inter, slope)

    residuals = thinkstats2.Residuals(heights, logs, inter, slope)
    r_squared = thinkstats2.CoefDetermination(logs, residuals)
    r = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', r)

    # sqrt(R^2) of the fit should reproduce the Pearson correlation
    assert almostEquals(r, rho_log)
def main(name, data_dir='.'):
    """Plots the sampling distribution of the slope for a flat model.

    Reads the data, builds a model whose intercept is the mean of ys and
    whose slope is 0, and plots the CDF of the slopes returned by
    SamplingDistributions for that model.  The figure is saved under the
    root name 'regress1'.

    The least squares report that used to follow an unconditional
    `return` could never run and is not part of this function.

    name: not used by this function; kept so existing callers still work
    data_dir: passed to ReadData to locate the data
    """
    # fixed seed; presumably makes the resampling below repeatable
    random.seed(17)

    xs, ys = ReadData(data_dir)

    # flat model: constant at the mean of ys, no dependence on xs
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    # only the slope distribution is plotted
    _, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept
    and slope, and R^2, followed by a blank line.

    ages: sequence of x values
    weights: sequence of y values

    returns: tuple of (inter, slope, R2)
    """
    # Each line is printed as one pre-formatted string, so the output is
    # the same whether `print` is the Python 2 statement or the function.

    # correlation between age and weight
    print('Pearson correlation {}'.format(thinkstats2.Corr(ages, weights)))
    print('Spearman correlation {}'.format(
        thinkstats2.SpearmanCorr(ages, weights)))

    # least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope): {} {}'.format(inter, slope))

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2 {}'.format(R2))
    print('')  # blank separator line
    return inter, slope, R2
def main():
    """Fits a least squares line to data from SatIqData and shows it.

    Prints the mean and variance of each variable, their Pearson
    correlation, the fitted intercept and slope, and R^2, then shows the
    fitted line over a scatter plot of the data.
    """
    # Each line is printed as one pre-formatted string, so the output is
    # the same whether `print` is the Python 2 statement or the function.
    random.seed(17)

    # presumably 1000 (x, y) pairs with correlation rho -- see SatIqData
    rho = 0.8
    xs, ys = SatIqData(1000, rho)
    print('mean, var of x {}'.format(thinkstats2.MeanVar(xs)))
    print('mean, var of y {}'.format(thinkstats2.MeanVar(ys)))
    print('Pearson corr {}'.format(thinkstats2.Corr(xs, ys)))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter {}'.format(inter))
    print('slope {}'.format(slope))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 {}'.format(R2))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted line and sampling dists.

    Prints summary statistics of the least squares fit of totalwgt_lb on
    agepreg, then saves three figures:

      linear3: 90% and 50% confidence intervals of the fitted line
      linear5: scatter plot of the data with two confidence bands
      linear4: CDF of the slope under the null hypothesis and the
               sampling distribution of the estimated slope

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    residuals = thinkstats2.Residuals(xs, ys, inter, slope)
    r_squared = thinkstats2.CoefDetermination(ys, residuals)

    print('rho', thinkstats2.Corr(xs, ys))
    print('R2', r_squared)
    print('R', math.sqrt(r_squared))
    print('Std(ys)', thinkstats2.Std(ys))
    print('Std(res)', thinkstats2.Std(residuals))

    axis_labels = dict(xlabel='age (years)', ylabel='birth weight (lbs)')

    # figure linear3: confidence intervals of the fitted line
    inters, slopes = SamplingDistributions(live, iters=1001)
    bands = [
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in bands:
        PlotConfidenceIntervals(xs, inters, slopes, percent=percent,
                                alpha=alpha, label=label)
        thinkplot.Text(42, text_y, text)

    thinkplot.Save(root='linear3', legend=False, **axis_labels)

    # figure linear5: the data with two bands, one of which is given the
    # residuals
    thinkplot.PrePlot(2)
    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    PlotConfidenceIntervals(xs, inters, slopes, res=residuals, alpha=0.2)
    PlotConfidenceIntervals(xs, inters, slopes)
    thinkplot.Save(root='linear5',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False,
                   **axis_labels)

    # slope under the null hypothesis versus its sampling distribution
    sampling_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((xs, ys))
    print('p-value, slope test', ht.PValue())

    # each estimate next to the mean of its sampling distribution
    for name, estimate, samples in (('inter', inter, inters),
                                    ('slope', slope, slopes)):
        print(name, estimate, thinkstats2.Mean(samples))
        Summarize(samples, estimate)

    # figure linear4
    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')  # vertical line at 0
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
#%% # Estimate intercept and slope inter, slope = thinkstats2.LeastSquares(heights, logWeight) print("intercept: {:.3f} \n slope: {:.3f}".format(inter, slope)) #%% # show scatter plot of fitted line thinkplot.Scatter(heights, logWeight, alpha=0.01, s=5) fxs, fys = thinkstats2.FitLine(heights, inter, slope) thinkplot.Plot(fxs, fys, color='red') thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)') #%% # get the residuals res = thinkstats2.Residuals(heights, logWeight, inter, slope) data['residual'] = res #%% # set up bins, indicies, and groups to calc mean and cdf bins = np.arange(130, 210, 5) indices = np.digitize(data.htm3, bins) groups = data.groupby(indices) means = [group.htm3.mean() for i, group in groups][1:-1] cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1] # plot the pencitles for p in [75, 50, 25]: ys = [cdf.Percentile(p) for cdf in cdfs] label = str(p) + 'th'