def RunSimpleRegression(live):
    """Regresses totalwgt_lb on agepreg two ways and checks that they agree.

    The fit is computed once with the thinkstats2 helpers and once with
    statsmodels OLS. The statsmodels results are passed to
    SummarizeResults, and the intercept, slope and R^2 from both
    approaches are asserted to match within 1e-6.

    live: DataFrame of live births
    """
    # thinkstats2 side: only rows where both columns are present
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    xs = complete.agepreg
    ys = complete.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    residuals = thinkstats2.Residuals(xs, ys, inter, slope)
    r2 = thinkstats2.CoefDetermination(ys, residuals)

    # statsmodels side: fitted on the unfiltered frame
    results = smf.ols('totalwgt_lb ~ agepreg', data=live).fit()
    SummarizeResults(results)

    def _close(a, b, tol=1e-6):
        return abs(a - b) < tol

    assert _close(results.params['Intercept'], inter)
    assert _close(results.params['agepreg'], slope)
    assert _close(results.rsquared, r2)
def PlotResiduals(live):
    """Plots the 75th, 50th and 25th percentiles of the residuals by age bin.

    Fits totalwgt_lb against agepreg, stores the residuals in a new
    'residual' column of live (the caller's DataFrame is modified), and
    saves the figure under the root name 'linear2'.

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    live['residual'] = thinkstats2.Residuals(xs, ys, inter, slope)

    # group the rows into 3-year age bins
    age_bins = np.arange(10, 48, 3)
    bin_index = np.digitize(live.agepreg, age_bins)

    mean_ages = []
    residual_cdfs = []
    for _, group in live.groupby(bin_index):
        mean_ages.append(group.agepreg.mean())
        residual_cdfs.append(thinkstats2.Cdf(group.residual))

    # the first and last groups are discarded
    mean_ages = mean_ages[1:-1]
    residual_cdfs = residual_cdfs[1:-1]

    thinkplot.PrePlot(3)
    for percent in [75, 50, 25]:
        percentiles = [cdf.Percentile(percent) for cdf in residual_cdfs]
        thinkplot.Plot(mean_ages, percentiles, label='%dth' % percent)

    thinkplot.Save(root='linear2',
                   xlabel='age (years)',
                   ylabel='residual (lbs)',
                   xlim=[10, 45])
def ComputeCorrelations(heights, weights):
    """Prints correlations and a least squares fit of log weight on height.

    Reports the Pearson correlation for raw and log weights, the Spearman
    correlation, and the least squares fit of log(weights) against
    heights. The correlations are asserted against hard-coded expected
    values, and sqrt(R^2) is asserted to equal the log-weight Pearson
    correlation.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    log_ws = np.log(weights)
    rho_log = thinkstats2.Corr(heights, log_ws)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    inter, slope = thinkstats2.LeastSquares(heights, log_ws)
    print('Least squares inter, slope (log weights):', inter, slope)

    residuals = thinkstats2.Residuals(heights, log_ws, inter, slope)
    r_squared = thinkstats2.CoefDetermination(log_ws, residuals)
    root_r2 = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', root_r2)
    assert almostEquals(root_r2, rho_log)
def PlotArrivalDepartureDelayFit(flights):
    """Plots departure delay against arrival delay with a fitted line.

    Works on a sample of 1000 rows drawn with thinkstats2.SampleRows,
    saves the figure under the root name
    'ArrivalDepartureDelayFit_linear1', then fits the same relationship
    with statsmodels OLS and passes the results to
    regression.SummarizeResults.

    flights: DataFrame with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    sample = thinkstats2.SampleRows(flights, 1000)
    xs = sample.ARRIVAL_DELAY
    ys = sample.DEPARTURE_DELAY

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    fit_xs, fit_ys = thinkstats2.FitLine(xs, inter, slope)

    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    # the fitted line is drawn twice: wide white first, narrower blue on top
    for line_color, line_width in [('white', 3), ('blue', 2)]:
        thinkplot.Plot(fit_xs, fit_ys, color=line_color, linewidth=line_width)
    thinkplot.Save(root='ArrivalDepartureDelayFit_linear1',
                   xlabel='arrival delay (min)',
                   ylabel='departure delay (min)',
                   legend=False)

    results = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=sample).fit()
    regression.SummarizeResults(results)
def TestStatistic(self, data):
    """Returns the least squares slope as the test statistic.

    data: pair of sequences (ages, weights)
    """
    xs, ys = data
    _inter, fitted_slope = thinkstats2.LeastSquares(xs, ys)
    return fitted_slope
def SamplingDistributions(dados_chuva, dados_vazao, iters=101):
    """Estimates sampling distributions of the fit by resampling rows.

    The two inputs are stacked into a DataFrame and transposed so each
    becomes a column. Every iteration resamples the rows with
    thinkstats2.ResampleRows and fits the "COIMBRA_F" column against the
    "COIMBRA_P" column by least squares.

    dados_chuva: series that ends up as the "COIMBRA_P" column
        (presumably rainfall -- confirm against the caller)
    dados_vazao: series that ends up as the "COIMBRA_F" column
        (presumably streamflow -- confirm against the caller)
    iters: number of resampling iterations

    returns: pair of sequences (inters, slopes); both are empty when no
        iterations are run
    """
    dados = pd.DataFrame([dados_chuva, dados_vazao]).T

    estimates = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRows(dados)
        chuva = sample["COIMBRA_P"]
        vazao = sample["COIMBRA_F"]
        estimates.append(thinkstats2.LeastSquares(chuva, vazao))

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    if not estimates:
        return (), ()

    inters, slopes = zip(*estimates)
    return inters, slopes
def SamplingDistributions(live, iters=101):
    """Estimates sampling distributions by resampling rows.

    Every iteration resamples the rows of live with
    thinkstats2.ResampleRows and fits totalwgt_lb against agepreg by
    least squares.

    live: DataFrame
    iters: number of times to run simulations

    returns: pair of sequences (inters, slopes); both are empty when no
        iterations are run
    """
    estimates = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRows(live)
        ages = sample.agepreg
        weights = sample.totalwgt_lb
        estimates.append(thinkstats2.LeastSquares(ages, weights))

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    if not estimates:
        return (), ()

    inters, slopes = zip(*estimates)
    return inters, slopes
def PlotFit(live):
    """Plots totalwgt_lb against agepreg with the least squares line.

    Saves the figure under the root name 'linear1'.

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    fit_xs, fit_ys = thinkstats2.FitLine(xs, inter, slope)

    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    # the fitted line is drawn twice: wide white first, narrower blue on top
    for line_color, line_width in [('white', 3), ('blue', 2)]:
        thinkplot.Plot(fit_xs, fit_ys, color=line_color, linewidth=line_width)
    thinkplot.Save(root='linear1',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   axis=[10, 45, 0, 15],
                   legend=False)
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept
    and slope, and R^2, followed by a blank line.

    ages: sequence
    weights: sequence

    returns: tuple (inter, slope, R2)
    """
    # correlation between age and weight
    print('Pearson correlation', thinkstats2.Corr(ages, weights))
    print('Spearman correlation', thinkstats2.SpearmanCorr(ages, weights))

    # least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope):', inter, slope)

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2', R2)
    print()
    return inter, slope, R2
def main(name, data_dir='.'):
    """Plots the sampling distribution of the slope for a flat-line model.

    Seeds the random module, reads (xs, ys) with ReadData, models ys by
    their mean with slope 0, and hands the fitted line and residuals to
    SamplingDistributions. The slope distribution it returns is plotted
    as a CDF and saved under the root name 'regress1'.

    name: not used by this function
    data_dir: directory passed to ReadData
    """
    random.seed(17)

    xs, ys = ReadData(data_dir)
    inter = thinkstats2.Mean(ys)
    slope = 0
    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')

    # NOTE(review): this early return makes everything below unreachable.
    # Confirm whether the least squares report is meant to stay disabled;
    # if so the code below can be deleted, otherwise remove the return.
    return

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter', inter)
    print('slope', slope)

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    # floor division: the result is used as an index
    i = len(fxs) // 2
    print('median weight, age', fxs[i], fys[i])

    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2', R2)
    print('R', math.sqrt(R2))

    #thinkplot.Plot(fxs, fys, color='gray', alpha=0.5)
    #thinkplot.Scatter(xs, ys, alpha=0.05)
    #thinkplot.Show()

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
def main():
    """Fits a line to data from SatIqData and shows it over a scatter plot.

    Seeds the random module, generates 1000 (x, y) pairs with
    SatIqData using rho = 0.8, prints summary statistics, the Pearson
    correlation, the least squares intercept and slope, and R^2, then
    displays the fitted line and the scatter plot.
    """
    random.seed(17)
    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter', inter)
    print('slope', slope)

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2', R2)

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Prints summary statistics of the fit of totalwgt_lb against agepreg,
    then saves three figures: confidence bands for the fitted line
    ('linear3'), the scatter plot with confidence bands ('linear5'), and
    the CDFs of the slope under the null hypothesis and under resampling
    ('linear4').

    live: DataFrame
    """
    xs = live.agepreg
    ys = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(xs, ys)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    r2 = thinkstats2.CoefDetermination(ys, res)

    print('rho', thinkstats2.Corr(xs, ys))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(ys))
    print('Std(res)', thinkstats2.Std(res))

    axis_labels = dict(xlabel='age (years)', ylabel='birth weight (lbs)')

    # figure 1: 90% and 50% confidence bands for the fitted line
    inters, slopes = SamplingDistributions(live, iters=1001)
    bands = [
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in bands:
        PlotConfidenceIntervals(xs, inters, slopes, percent=percent,
                                alpha=alpha, label=label)
        thinkplot.Text(42, text_y, text)
    thinkplot.Save(root='linear3', legend=False, **axis_labels)

    # figure 2: scatter plot with confidence bands, with and without res
    thinkplot.PrePlot(2)
    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    PlotConfidenceIntervals(xs, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(xs, inters, slopes)
    thinkplot.Save(root='linear5',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False,
                   **axis_labels)

    # figure 3: sampling distribution of slope under the null hypothesis
    # and the alternate hypothesis
    sampling_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((xs, ys))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    thinkplot.PrePlot(2)
    # vertical reference line at slope 0
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
# Like the NSFG, the BRFSS oversamples some groups and provides a sampling
# weight for each respondent. In the BRFSS data, the variable name for these
# weights is totalwt. Use resampling, with and without weights, to estimate
# the mean height of respondents in the BRFSS, the standard error of the
# mean, and a 90% confidence interval. How much does correct weighting
# affect the estimates?

#%%
# Read the BRFSS data, keeping only rows that have both htm3 and wtkg2.
data = brfss.ReadBrfss(nrows=None).dropna(subset=['htm3', 'wtkg2'])
weights = data['wtkg2']
heights = data['htm3']

# Base-10 log of the weights.
logWeight = np.log10(weights)

#%%
# Estimate the intercept and slope of log10 weight against height.
inter, slope = thinkstats2.LeastSquares(heights, logWeight)
print("intercept: {:.3f} \n slope: {:.3f}".format(inter, slope))

#%%
# Scatter plot of the data with the fitted line drawn in red.
thinkplot.Scatter(heights, logWeight, alpha=0.01, s=5)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)')

#%%
# Compute the residuals of the fit and store them as a new column.
res = thinkstats2.Residuals(heights, logWeight, inter, slope)
data['residual'] = res
def Permute(fxs, fys, res):
    """Fits a line to fys plus a random permutation of the residuals.

    The residuals are shuffled with the random module and added
    elementwise to fys; the result is fitted against fxs by least
    squares.

    fxs: sequence of x values
    fys: sequence of y values, paired positionally with res
    res: sequence of residuals; the caller's sequence is not modified

    returns: tuple (inter, slope)
    """
    # random.shuffle works in place, so shuffle a copy rather than the
    # caller's residuals
    shuffled = list(res)
    random.shuffle(shuffled)

    # explicit elementwise sum: `fys + res` would concatenate if both
    # were plain lists
    permuted_ys = [fy + r for fy, r in zip(fys, shuffled)]

    inter, slope = thinkstats2.LeastSquares(fxs, permuted_ys)
    return inter, slope
def TestStatistic(self, data):
    """Returns the least squares slope as the test statistic.

    data: pair of sequences (chuva, vazao); presumably rainfall and
        streamflow -- confirm against the caller
    """
    x_values, y_values = data
    fit = thinkstats2.LeastSquares(x_values, y_values)
    _inter, fitted_slope = fit
    return fitted_slope
def TestStatistic(self, data):
    """Computes the test statistic: the slope of the least squares fit.

    data: pair of sequences (ages, weights)
    """
    age_values, weight_values = data
    _inter, fitted_slope = thinkstats2.LeastSquares(age_values, weight_values)
    return fitted_slope