def Corr(xs, ys):
    """Computes the covariance of xs and ys normalised by their variances.

    This is Pearson's correlation coefficient, assuming Cov returns the
    covariance and thinkstats2.MeanVar returns a (mean, variance) pair.

    xs: sequence of values
    ys: sequence of values

    Returns: Cov(xs, ys) / sqrt(var(xs) * var(ys))
    """
    x_arr = np.asarray(xs)
    y_arr = np.asarray(ys)

    x_mean, x_var = thinkstats2.MeanVar(x_arr)
    y_mean, y_var = thinkstats2.MeanVar(y_arr)

    # the means are handed to Cov together with the arrays
    covariance = Cov(x_arr, y_arr, x_mean, y_mean)
    return covariance / np.sqrt(x_var * y_var)
def MakeErrorModel(model, ys, ts, n=100):
    """Makes a model that captures sample error and residual error.

    model: string representation of the regression model
    ys: dependent variable
    ts: explanatory variable
    n: number of simulations to run

    Returns a tuple (fts, sample_error, total_error): fts is the first
    value returned by MakeFit for the original data; sample_error and
    total_error are each the pair of rows returned by MakeStderr.
    """
    # estimate mean and stddev of the residuals
    residuals = Residuals(model, ys, ts)
    mu, var = thinkstats2.MeanVar(residuals)
    sig = math.sqrt(var)

    # make the best fit
    fts, fys = MakeFit(model, ys, ts)

    # resample residuals and generate hypothetical fits
    fits = []
    for _ in range(n):
        # NOTE(review): the last fitted value is dropped here -- presumably
        # MakeFit returns one more point than it was given; confirm.
        fake_ys = [fy + random.gauss(mu, sig) for fy in fys[:-1]]
        _fake_ts, fake_fys = MakeFit(model, fake_ys, ts)
        fits.append(fake_fys)

    # Transpose so that each column holds the n simulated values for one
    # position.  The columns are traversed twice below, so they must be a
    # real list: on Python 3 a bare zip() is a one-shot iterator and the
    # second MakeStderr call would receive nothing.
    columns = list(zip(*fits))

    # error band from the simulated fits alone, then with the residual
    # mean and variance folded in
    # NOTE(review): the original comment called this a "90% CI"; confirm
    # against the interval MakeStderr actually computes.
    sample_error = MakeStderr(columns)
    total_error = MakeStderr(columns, mu, var)

    return fts, sample_error, total_error
def main(name, data_dir=''): pool, firsts, others = MakeTables(data_dir) for table in [pool, firsts, others]: print table.name, len(table.records), print len(table.ages), len(table.weights) # compute differences in mean age and weight age_diff = DifferenceInMeans(firsts, others, 'ages') weight_diff = DifferenceInMeans(firsts, others, 'weights') # get ages and weights ages, weights = GetAgeWeight(pool) print 'Mean, var weight', thinkstats2.MeanVar(weights) # compute a least squares fit inter, slope, R2 = ComputeLeastSquares(ages, weights) # see how much of the weight difference is explained by age weight_diff_explained = age_diff * slope print 'Weight difference explained by age:', weight_diff_explained print 'Fraction explained:', weight_diff_explained / weight_diff print # make a table of mean weight for 5-year age bins weight_dict = Partition(ages, weights) MakeLinePlot(weight_dict) # the correlations are slightly higher if we trim outliers ages, weights = GetAgeWeight(pool, low=4, high=12) inter, slope, R2 = ComputeLeastSquares(ages, weights) MakeFigures(pool, firsts, others)
def MakeStderr(columns, mu2=0, var2=0):
    """Finds a confidence interval for each column.

    columns: iterable of sequences; thinkstats2.MeanVar is applied to each
    mu2: value added to every column's mean
    var2: value added to every column's variance

    Returns two rows: the low ends of the intervals and the high ends,
    taken two combined standard deviations below and above the combined
    mean.
    """
    # summarise every column first, then build the two rows from the summaries
    summaries = [thinkstats2.MeanVar(col) for col in columns]

    low_row = []
    for mu1, var1 in summaries:
        low_row.append(mu1 + mu2 - 2 * math.sqrt(var1 + var2))

    high_row = []
    for mu1, var1 in summaries:
        high_row.append(mu1 + mu2 + 2 * math.sqrt(var1 + var2))

    return low_row, high_row
def main(): random.seed(17) rho = -0.8 res = CorrelatedGenerator(1000, rho) xs, ys = zip(*res) a = 1.0 b = 0.0 xs = [a * x + b for x in xs] print 'mean, var of x', thinkstats2.MeanVar(xs) print 'mean, var of y', thinkstats2.MeanVar(ys) print 'covariance', thinkstats2.Cov(xs, ys) print 'Pearson corr', thinkstats2.Corr(xs, ys) print 'Spearman corr', thinkstats2.SpearmanCorr(xs, ys) thinkplot.Scatter(xs, ys) thinkplot.Show()
def main(): random.seed(17) rho = 0.8 xs, ys = SatIqData(1000, rho) print 'mean, var of x', thinkstats2.MeanVar(xs) print 'mean, var of y', thinkstats2.MeanVar(ys) print 'Pearson corr', thinkstats2.Corr(xs, ys) inter, slope = thinkstats2.LeastSquares(xs, ys) print 'inter', inter print 'slope', slope fxs, fys = thinkstats2.FitLine(xs, inter, slope) res = thinkstats2.Residuals(xs, ys, inter, slope) R2 = thinkstats2.CoefDetermination(ys, res) print 'R2', R2 thinkplot.Plot(fxs, fys, color='gray', alpha=0.2) thinkplot.Scatter(xs, ys) thinkplot.Show()
def main():
    """Runs three hypothesis tests on pregnancy length and saves their CDFs.

    Seeds via thinkstats2.RandomSeed(17), loads the frames from
    first.MakeFrames(), prints the mean and variance of prglngth for live
    births, then for each test prints a p-value (1000 iterations), plots
    the CDF and saves it with thinkplot.Save.
    """
    thinkstats2.RandomSeed(17)

    # get the data
    live, firsts, others = first.MakeFrames()
    print('(Mean, Var) of prglength for live births',
          thinkstats2.MeanVar(live.prglngth))

    data = firsts.prglngth.values, others.prglngth.values

    def run_test(make_test, root, title, xlabel):
        # build the test on the shared data, report its p-value, save its CDF
        test = make_test(data)
        print('p-value =', test.PValue(iters=1000))
        test.PlotCdf()
        thinkplot.Save(root=root,
                       title=title,
                       xlabel=xlabel,
                       ylabel='CDF',
                       legend=False)

    # test the difference in means
    run_test(DiffMeansPermute, 'hypothesis1',
             'Permutation test', 'difference in means (weeks)')

    # test the difference in std
    run_test(DiffStdPermute, 'hypothesis2',
             'Permutation test', 'difference in std (weeks)')

    # test the difference in means by resampling
    # NOTE(review): this section is labelled as a resampling test of the
    # difference in means, yet it builds DiffStdPermute again -- looks like
    # a copy-paste slip; confirm which test class was intended.
    run_test(DiffStdPermute, 'hypothesis3',
             'Resampling test', 'difference in means (weeks)')
# NOTE(review): the statements down to `return cov` are the tail of a function
# whose `def` line is not visible in this excerpt; they are kept as found.
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    # fall back to the sample means when none were supplied
    if meanx is None:
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    # mean of the products of deviations (divides by len(xs))
    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov

def Corr(xs, ys):
    """Computes Cov(xs, ys) divided by sqrt(var(xs) * var(ys)).

    xs: sequence of values
    ys: sequence of values

    Returns: the correlation value
    """
    # Pearson's
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    meanx, varx = thinkstats2.MeanVar(xs)
    meany, vary = thinkstats2.MeanVar(ys)

    corr = Cov(xs, ys, meanx, meany) / np.sqrt(varx * vary)
    return corr

import pandas as pd

def SpearmanCorr(xs, ys):
    """Computes Corr on the ranks of xs and ys.

    xs: sequence of values
    ys: sequence of values

    Returns: Corr applied to the pandas ranks of the two sequences
    """
    # Spearman's
    xranks = pd.Series(xs).rank()
    yranks = pd.Series(ys).rank()
    return Corr(xranks, yranks)

# `ages` and `weights` are not defined in the visible excerpt
print('Corr', Corr(ages, weights))
print('SpearmanCorr', SpearmanCorr(ages, weights))
# RESULTS: Corr 0.0688339703541
def testMeanVar(self):
    """Checks thinkstats2.MeanVar against hand-computed values."""
    sample = [1, 1, 1, 3, 3, 591]
    observed_mean, observed_var = thinkstats2.MeanVar(sample)

    # sum is 600 over 6 values -> mean 100
    self.assertAlmostEqual(observed_mean, 100.0)
    # squared deviations sum to 289302; dividing by n=6 gives 48217
    self.assertAlmostEqual(observed_var, 48217.0)