def Corr(xs, ys):
    """Computes Pearson's correlation coefficient of two sequences.

    xs: sequence of values
    ys: sequence of values, paired with xs

    Returns: Cov(xs, ys) divided by the square root of the product of
    the two variances reported by thinkstats2.MeanVar
    """
    xs, ys = np.asarray(xs), np.asarray(ys)

    mean_x, var_x = thinkstats2.MeanVar(xs)
    mean_y, var_y = thinkstats2.MeanVar(ys)

    # the means are passed along so Cov does not have to recompute them
    covariance = Cov(xs, ys, mean_x, mean_y)
    scale = np.sqrt(var_x * var_y)
    return covariance / scale
Esempio n. 2
0
def MakeErrorModel(model, ys, ts, n=100):
    """Makes a model that captures sample error and residual error.

    model: string representation of the regression model
    ys:    dependent variable
    ts:    explanatory variable
    n:     number of simulations to run

    Returns a tuple (fts, sample_error, total_error): fts is the first
    value MakeFit returns for the original data, and each error model is
    the result of a MakeStderr call over the simulated fits.
    """
    # estimate mean and stddev of the residuals
    residuals = Residuals(model, ys, ts)
    mu, var = thinkstats2.MeanVar(residuals)
    sig = math.sqrt(var)

    # make the best fit
    fts, fys = MakeFit(model, ys, ts)

    # generate hypothetical fits: add Gaussian noise with the residuals'
    # mean and stddev to the best fit, then refit.
    # NOTE(review): the last fitted value is dropped (fys[:-1]), presumably
    # so the fake data lines up with ts -- confirm against MakeFit.
    fits = []
    for _ in range(n):
        fake_ys = [fy + random.gauss(mu, sig) for fy in fys[:-1]]
        _ignored, fake_fys = MakeFit(model, fake_ys, ts)
        fits.append(fake_fys)

    # transpose so each column holds the n simulated values at one
    # position. The result is traversed twice below, so it must be a real
    # list: on Python 3 zip() returns a one-shot iterator and the second
    # MakeStderr call would otherwise see no columns at all.
    columns = list(zip(*fits))

    # spread of the simulated fits alone
    sample_error = MakeStderr(columns)
    # same, with the residual mean and variance added in
    total_error = MakeStderr(columns, mu, var)

    return fts, sample_error, total_error
Esempio n. 3
0
def main(name, data_dir=''):
    """Relates birth weight to mother's age, printing and plotting results.

    name:     not used in this function
              NOTE(review): presumably the script name -- confirm with caller
    data_dir: passed through to MakeTables

    Output is written with single-argument print(...) calls and %s
    formatting so the text is the same on Python 2 and Python 3.
    """
    pool, firsts, others = MakeTables(data_dir)

    # one summary line per table: name, then the three lengths
    for table in [pool, firsts, others]:
        print('%s %s %s %s' % (table.name, len(table.records),
                               len(table.ages), len(table.weights)))

    # compute differences in mean age and weight
    age_diff = DifferenceInMeans(firsts, others, 'ages')
    weight_diff = DifferenceInMeans(firsts, others, 'weights')

    # get ages and weights
    ages, weights = GetAgeWeight(pool)
    # wrap in a 1-tuple so a tuple result is formatted as one value
    print('Mean, var weight %s' % (thinkstats2.MeanVar(weights),))

    # compute a least squares fit; only the slope is needed here
    _, slope, _ = ComputeLeastSquares(ages, weights)

    # see how much of the weight difference is explained by age
    weight_diff_explained = age_diff * slope
    print('Weight difference explained by age: %s'
          % (weight_diff_explained,))
    print('Fraction explained: %s'
          % (weight_diff_explained / weight_diff,))
    print('')

    # make a table of mean weight for 5-year age bins
    weight_dict = Partition(ages, weights)
    MakeLinePlot(weight_dict)

    # the correlations are slightly higher if we trim outliers
    # (the fit's return values are not used; the call itself is kept)
    ages, weights = GetAgeWeight(pool, low=4, high=12)
    ComputeLeastSquares(ages, weights)

    MakeFigures(pool, firsts, others)
Esempio n. 4
0
def MakeStderr(columns, mu2=0, var2=0):
    """Builds a band of mean +/- 2 standard deviations for each column.

    columns: iterable of sequences; thinkstats2.MeanVar is applied to each
    mu2:     offset added to every column mean
    var2:    variance added to every column variance

    Returns two lists: the low ends of the bands and the high ends.
    """
    stats = [thinkstats2.MeanVar(column) for column in columns]

    lows, highs = [], []
    for mean, variance in stats:
        center = mean + mu2
        half_width = 2 * math.sqrt(variance + var2)
        lows.append(center - half_width)
        highs.append(center + half_width)
    return lows, highs
Esempio n. 5
0
def main():
    """Generates correlated data, prints summary statistics, and plots it.

    Output is written with single-argument print(...) calls and %s
    formatting so the text is the same on Python 2 and Python 3.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform of xs (a=1, b=0 here)
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # each value is wrapped in a 1-tuple so a tuple result (such as the
    # pair MeanVar returns) is formatted as a single value
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('covariance %s' % (thinkstats2.Cov(xs, ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman corr %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Esempio n. 6
0
def main():
    """Fits a least-squares line to generated data and reports the fit.

    Prints the mean/variance of each variable, their Pearson correlation,
    the fitted intercept and slope, and R2, then plots the fitted line
    over a scatter of the data.

    Output is written with single-argument print(...) calls and %s
    formatting so the text is the same on Python 2 and Python 3.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)
    # each value is wrapped in a 1-tuple so a tuple result (such as the
    # pair MeanVar returns) is formatted as a single value
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter %s' % (inter,))
    print('slope %s' % (slope,))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 %s' % (R2,))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Esempio n. 7
0
def main():
    """Runs three hypothesis tests on pregnancy length and saves CDF plots.

    The data are the prglngth values of the `firsts` and `others` frames
    returned by first.MakeFrames(). For each test the p-value (1000
    iterations) is printed and the CDF plot is saved under its own root
    name.
    """
    thinkstats2.RandomSeed(17)

    # get the data
    live, firsts, others = first.MakeFrames()
    print('(Mean, Var) of prglength for live births',
          thinkstats2.MeanVar(live.prglngth))
    samples = (firsts.prglngth.values, others.prglngth.values)

    def run_test(make_test, root, title, xlabel):
        # build the test, report its p-value, then save its CDF plot
        test = make_test(samples)
        print('p-value =', test.PValue(iters=1000))

        test.PlotCdf()
        thinkplot.Save(root=root,
                       title=title,
                       xlabel=xlabel,
                       ylabel='CDF',
                       legend=False)

    # test the difference in means
    run_test(DiffMeansPermute,
             'hypothesis1',
             'Permutation test',
             'difference in means (weeks)')

    # test the difference in std
    run_test(DiffStdPermute,
             'hypothesis2',
             'Permutation test',
             'difference in std (weeks)')

    # test the difference in means by resampling
    # NOTE(review): the labels describe a resampling test of the difference
    # in means, but the test constructed is DiffStdPermute, the same as the
    # previous one -- looks like a copy/paste slip; confirm the intended
    # test class.
    run_test(DiffStdPermute,
             'hypothesis3',
             'Resampling test',
             'difference in means (weeks)')
Esempio n. 8
0
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    if meanx is None:
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov

def Corr(xs, ys):
    """Pearson's correlation of xs and ys.

    xs: sequence of values
    ys: sequence of values, paired with xs

    Returns: the covariance from Cov, normalized by the square root of
    the product of the variances from thinkstats2.MeanVar
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    (mx, vx), (my, vy) = thinkstats2.MeanVar(xs), thinkstats2.MeanVar(ys)

    return Cov(xs, ys, mx, my) / np.sqrt(vx * vy)
 
import pandas as pd

def SpearmanCorr(xs, ys):
    """Spearman's rank correlation of xs and ys.

    Each sequence is converted to ranks with pandas Series.rank()
    (default settings), and the ranks are passed to Corr.
    """
    ranked = [pd.Series(values).rank() for values in (xs, ys)]
    return Corr(*ranked)

# report both correlation measures for the age/weight data
print('Corr', Corr(xs=ages, ys=weights))
print('SpearmanCorr', SpearmanCorr(xs=ages, ys=weights))
# RESULTS: Corr 0.0688339703541
    def testMeanVar(self):
        """Checks thinkstats2.MeanVar on a small sample with one outlier."""
        sample = [1, 1, 1, 3, 3, 591]
        # the values sum to 600 over 6 items; the expected variance is the
        # sum of squared deviations divided by 6
        expected_mean, expected_var = 100.0, 48217.0

        mean, var = thinkstats2.MeanVar(sample)

        self.assertAlmostEqual(mean, expected_mean)
        self.assertAlmostEqual(var, expected_var)