Example #1
0
def Corr(xs, ys):
    """Compute Pearson's correlation coefficient of xs and ys.

    The covariance of the two sequences is divided by the square root of
    the product of their variances (i.e. it is expressed in standard-score
    units).  Note that Pearson's correlation is sensitive to outliers.

    xs: sequence of values
    ys: sequence of values

    Returns: the covariance divided by sqrt(var(xs) * var(ys))
    """
    covariance = correlation.Cov(xs, ys)
    var_x = thinkstats.MeanVar(xs)[1]
    var_y = thinkstats.MeanVar(ys)[1]
    scale = math.sqrt(var_x * var_y)
    return covariance / scale
Example #2
0
def CoefDetermination(ys, res):
    """Return the coefficient of determination (R^2) for a set of residuals.

    Args:
        ys: values of the dependent variable
        res: residuals of the fit

    Returns:
        1 minus the ratio of the residual variance to the variance of ys
    """
    # only the variances are needed; the means are discarded
    _, var_ys = thinkstats.MeanVar(ys)
    _, var_res = thinkstats.MeanVar(res)
    unexplained = var_res / var_ys
    return 1 - unexplained
Example #3
0
def ProcessScoresPairwise(pairs):
    """Plot and summarize the average goals each team scores per opponent.

    pairs: map from (team1, team2) to a sequence of (score1, score2)
           entries, one entry per game between the two teams

    Side effects: shows a CDF plot of the per-matchup averages, prints
    their mean and standard deviation, and prints the raw entries stored
    under the ('BOS', 'VAN') key.  Matchups with fewer than 3 recorded
    games are excluded from the averages.
    """
    # map from (scoring team, opponent) to list of goals scored
    goals_scored = {}
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault((t1, t2), []).append(g1)
            goals_scored.setdefault((t2, t1), []).append(g2)

    # average goals per matchup, skipping matchups with too few games
    lams = [thinkstats.Mean(goals)
            for goals in goals_scored.values()
            if len(goals) >= 3]

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    # single pre-formatted strings print identically under Python 2 and 3
    print('mu, sig %s %s' % (mu, math.sqrt(var)))

    print('BOS v VAN %s' % (pairs['BOS', 'VAN'],))
Example #4
0
def ProcessScoresTeamwise(pairs):
    """Plot and summarize the average number of goals scored by each team.

    pairs: map from (team1, team2) to a sequence of (score1, score2)
           entries, one entry per game between the two teams

    Side effects: shows a CDF plot of the per-team averages and prints
    their mean and standard deviation.
    """
    # map from team to list of goals scored
    goals_scored = {}
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # one average per team; the team name itself is not needed here
    lams = [thinkstats.Mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
Example #5
0
def process(data):
    """Plot the histogram, PMF and CDF of data next to a normal model.

    data: sequence of values

    Shows the histogram and the PMF in separate figures, then draws the
    empirical CDF together with a scatter of normal-CDF values computed
    with the mean and standard deviation estimated from data.  The
    estimated mu and sigma are printed.
    """
    # histogram of the data
    histogram = Pmf.MakeHistFromList(data, name='hist')
    myplot.Hist(histogram, color='blue')
    myplot.Show()

    # PMF derived from the histogram
    mass = Pmf.MakePmfFromHist(histogram, name='pmf')
    myplot.Pmf(mass, color='yellow')
    myplot.Show()

    myplot.Clf()

    # CDF of the actual data
    myplot.Cdf(Cdf.MakeCdfFromList(data, name='loafs'))

    mu, var = thinkstats.MeanVar(data)
    sigma = math.sqrt(var)
    print("mu = %.3f, sigma = %.3f" % (mu, sigma))

    # normal model: sample points (presumably drawn from N(mu, sigma) by
    # normal_sample -- confirm against its definition) and their CDF values
    xs = normal_sample(len(data), mu, sigma)
    ys = []
    for x in xs:
        ys.append(erf.NormalCdf(x, mu=mu, sigma=sigma))
    myplot.Scatter(xs, ys, color='red', label='sample')
    myplot.Show()
Example #6
0
def Corr(xs, ys):
    """Computes Corr(X, Y).

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Cov(X, Y) divided by the square root of Var(X) * Var(Y)
    """
    mean_x, var_x = thinkstats.MeanVar(xs)
    mean_y, var_y = thinkstats.MeanVar(ys)

    # the means are passed along so Cov receives them precomputed
    covariance = Cov(xs, ys, mean_x, mean_y)
    return covariance / math.sqrt(var_x * var_y)
Example #7
0
def PValue(model1, model2, n, m, delta, iters=1000):
    """Builds the distribution of resampled deltas and a two-sided p-value.

    Args:
        model1: sequence of values from the first hypothetical distribution
        model2: sequence of values from the second hypothetical distribution
        n: sample size from model1
        m: sample size from model2
        delta: the observed difference in the means
        iters: how many resampled deltas to generate

    Returns:
        tuple of (Cdf of the resampled deltas, p-value), where the p-value
        is the sum of the two tail probabilities printed by this function
    """
    deltas = []
    for _ in range(iters):
        deltas.append(Resample(model1, model2, n, m))
    print('(Mean, Var) of resampled deltas', thinkstats.MeanVar(deltas))

    cdf = Cdf.MakeCdfFromList(deltas)

    # tail below -delta plus tail above +delta
    left = cdf.Prob(-delta)
    right = 1.0 - cdf.Prob(delta)
    pvalue = left + right

    print('Tails (left, right, total):', left, right, pvalue)

    return cdf, pvalue
def MakeUniformPrior(t, num_points, label, spread=3.0):
    """Makes a uniform prior distribution for mu and sigma based on a sample.

    t: sample
    num_points: number of values in each dimension
    label: string label for the new Pmf
    spread: number of standard errors to include

    Also prints the sample mean, the sample standard deviation and their
    ratio.

    Returns: tuple (ms, ss, pmf) where ms and ss are the arrays of candidate
             mu and sigma values and pmf maps each (mu, sigma) pair to the
             same (unnormalized) weight of 1.
    """
    # estimate mean and stddev of t
    n = len(t)
    xbar, S2 = thinkstats.MeanVar(t)
    sighat = math.sqrt(S2)

    # single pre-formatted string prints identically under Python 2 and 3
    print('%s %s %s' % (xbar, sighat, sighat / xbar))

    # compute standard error for mu and the range of ms
    stderr_xbar = sighat / math.sqrt(n)
    mspread = spread * stderr_xbar
    ms = numpy.linspace(xbar - mspread, xbar + mspread, num_points)

    # compute standard error for sigma and the range of ss
    stderr_sighat = sighat / math.sqrt(2 * (n - 1))
    sspread = spread * stderr_sighat
    ss = numpy.linspace(sighat - sspread, sighat + sspread, num_points)

    # populate the PMF with equal weight for every grid point
    pmf = Pmf.Pmf(name=label)
    for m in ms:
        for s in ss:
            pmf.Set((m, s), 1)
    return ms, ss, pmf
Example #9
0
def MakeErrorModel(model, ys, ts, n=100):
    """Makes a model that captures sample error and residual error.

    model: string representation of the regression model
    ys:    dependent variable
    ts:    explanatory variable
    n:     number of simulations to run

    Returns a tuple (fts, sample_error, total_error): fts comes from the
    best fit, and each error model is the pair of rows (low, high)
    produced by MakeStddev.
    """
    # estimate mean and stddev of the residuals
    residuals = Residuals(model, ys, ts)
    mu, var = thinkstats.MeanVar(residuals)
    sig = math.sqrt(var)

    # make the best fit
    fts, fys = MakeFit(model, ys, ts)

    # resample residuals and generate hypothetical fits
    fits = []
    for _ in range(n):
        # NOTE(review): the last fitted value is dropped here, presumably so
        # fake_ys lines up with ts -- confirm against MakeFit
        fake_ys = [fy + random.gauss(mu, sig) for fy in fys[:-1]]
        _, fake_fys = MakeFit(model, fake_ys, ts)
        fits.append(fake_fys)

    # Transpose so each column holds every simulated value for one point.
    # Materialize it: on Python 3 zip() is a one-shot iterator, and the
    # columns are consumed twice below.
    columns = list(zip(*fits))

    # intervals from the spread of the fits alone, then with the residual
    # mean and variance added in
    sample_error = MakeStddev(columns)
    total_error = MakeStddev(columns, mu, var)

    return fts, sample_error, total_error
Example #10
0
def LeastSquares(xs, ys):
    """Fits ys as a linear function of xs by least squares.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        tuple of (intercept, slope)
    """
    mean_x, var_x = thinkstats.MeanVar(xs)
    mean_y, _ = thinkstats.MeanVar(ys)

    # slope is Cov(x, y) / Var(x); the line passes through the two means
    slope = Cov(xs, ys, mean_x, mean_y) / var_x
    return mean_y - slope * mean_x, slope
def Process(table):
    """Attach pregnancy-length statistics to table.

    table: object with a `records` sequence whose items have `prglength`

    Sets table.lengths, table.n, table.mu (via Mean) and table.sd (square
    root of the variance reported by thinkstats.MeanVar).
    """
    # read the pregnancy-length field from every record in the table
    table.lengths = [record.prglength for record in table.records]
    table.n = len(table.records)
    table.mu = Mean(table.lengths)
    _, variance = thinkstats.MeanVar(table.lengths)
    table.sd = math.sqrt(variance)
Example #12
0
def accident_cdf(col_dict, attr='accidents'):
    """Makes a CDF of the number of accidents per day.

    Also prints the value returned by thinkstats.MeanVar for the column.

    col_dict: map from attribute name to column of data
    attr: string attribute name, one of accidents, injury, fatal

    Returns: Cdf built from the selected column
    """
    accidents = col_dict[attr]
    cdf = Cdf.MakeCdfFromList(accidents)
    # parenthesized single argument prints identically on Python 2 and 3
    print(thinkstats.MeanVar(accidents))
    return cdf
Example #13
0
def MakeStddev(columns, mu2=0, var2=0):
    """Builds a two-standard-deviation band around each column.

    columns: iterable of sequences of values
    mu2: extra mean added to each column mean
    var2: extra variance added to each column variance

    Returns two rows: the low ends of the bands and the high ends.
    """
    lows, highs = [], []
    for ys in columns:
        mu1, var1 = thinkstats.MeanVar(ys)
        center = mu1 + mu2
        half_width = 2 * math.sqrt(var1 + var2)
        lows.append(center - half_width)
        highs.append(center + half_width)
    return lows, highs
Example #14
0
def Test(actual1, actual2, model, iters=1000):
    """Estimates a two-sided p-value for the difference in group means.

    The p-value comes from a normal approximation whose variance is the
    pooled variance scaled by (1/n + 1/m).  The mean and variance of
    resampled deltas are printed afterwards for comparison.

    Args:
        actual1: sequence of observed values for group 1
        actual2: sequence of observed values for group 2
        model: sequence of values from the hypothetical distribution
        iters: number of resampled deltas to generate

    Returns:
        the p-value from the normal approximation
    """
    n = len(actual1)
    m = len(actual2)

    # observed difference in means, made non-negative
    mu1, mu2, delta = hypothesis.DifferenceInMean(actual1, actual2)
    delta = abs(delta)

    summary = (('n:', n), ('m:', m), ('mu1', mu1), ('mu2', mu2),
               ('delta', delta))
    for label, value in summary:
        print(label, value)

    # expected distribution of differences in sample mean
    mu_pooled, var_pooled = thinkstats.MeanVar(model)
    print('(Mean, Var) of pooled data', mu_pooled, var_pooled)

    mu = 0
    var = (1.0 / n + 1.0 / m) * var_pooled
    print('Expected Mean, Var of deltas', mu, var)

    # both tails of the normal approximation
    sigma = math.sqrt(var)
    left = erf.NormalCdf(-delta, mu, sigma)
    right = 1 - erf.NormalCdf(delta, mu, sigma)
    pvalue = left + right
    print('Tails:', left, right)
    print('p-value:', pvalue)

    # cross-check against the mean and variance of resampled differences
    deltas = [hypothesis.Resample(model, model, n, m) for _ in range(iters)]
    print('(Mean, Var) of resampled deltas', thinkstats.MeanVar(deltas))

    return pvalue
Example #15
0
def Corr(xs, ys):
    """Computes Corr(X, Y), falling back to 0 on a zero denominator.

    Args:
        xs: sequence of values
        ys: sequence of values

    Returns:
        Corr(X, Y), or 0 when the division raises ZeroDivisionError
    """
    mean_x, var_x = thinkstats.MeanVar(xs)
    mean_y, var_y = thinkstats.MeanVar(ys)

    try:
        return Cov(xs, ys, mean_x, mean_y) / math.sqrt(var_x * var_y)
    except ZeroDivisionError:
        # the correlation is reported as 0 in this case
        return 0
Example #16
0
def Q1(people, prob, year):
    """Run the simulation 100 times and report the spread of the results.

    people, prob, year: forwarded unchanged to process() on every run

    Prints the mean, variance and standard deviation of the results.

    Returns: list with the result of each run
    """
    # number of simulation runs
    simcount = 100
    results = [process(people, prob, year) for _ in range(simcount)]

    mu, var = thinkstats.MeanVar(results)
    sigma = math.sqrt(var)
    print("Q1: mu = %.3f, var = %.3f, sigma = %.3f" % (mu, var, sigma))
    return results
def Pumpkin():
    """Exercise 2-1: print the mean, variance and standard deviation of a
    fixed sample of pumpkin weights.

    :return: None
    """
    weights = (1, 1, 1, 3, 3, 591)
    print('计算南瓜重量的均值、方差和标准差')
    mean, variance = thinkstats.MeanVar(weights)
    print('均值 = ', mean)
    print('方差 = ', variance)
    sigma = math.sqrt(variance)
    print('标准差 = ', sigma)
Example #18
0
def PearsonSkewness(d):
    """Compute Pearson's median skewness coefficient of a sample.

    The coefficient is 3 * (mean - median) / sigma, with the mean and the
    variance taken from thinkstats.MeanVar.  The median is also printed.

    d: non-empty sequence of numbers; it is not modified

    Returns: the skewness coefficient
    """
    n = len(d)
    # sorted() returns a new list; the median must be read from that copy
    ordered = sorted(d)
    mean, var = thinkstats.MeanVar(d)
    sigma = math.sqrt(var)

    mid = n // 2  # integer index on both Python 2 and 3
    if n % 2:
        # odd count: the single middle element
        median = ordered[mid]
    else:
        # even count: average of the two middle elements (float division)
        median = (ordered[mid - 1] + ordered[mid]) / 2.0

    print("median = %.3f" % median)
    return 3 * (mean - median) / sigma
Example #19
0
def main():
    """Print height/weight covariances computed by two implementations.

    Reads the respondent records from the 'res' directory, then prints
    Cov(heights, weights), Cov(heights, heights) and the variance of the
    heights, first using the local Cov and then correlation.Cov.
    """
    respondents = brfss.Respondents()
    respondents.ReadRecords(data_dir='res')
    heights, weights = respondents.GetHeightAndWeight()

    # local implementation
    cov_hw = Cov(heights, weights)
    cov_hh = Cov(heights, heights)
    height_var = thinkstats.MeanVar(heights)[1]
    print(cov_hw, cov_hh, height_var)

    print("-------------- ")
    # official implementation, method 2
    official_hw = correlation.Cov(heights, weights)
    official_hh = correlation.Cov(heights, heights)
    print(official_hw, official_hh, height_var)
Example #20
0
def Summarize(data_dir):
    """Prints summary statistics for first babies and others.

    data_dir: forwarded to MakeTables

    Prints, for each group, the standard deviation and mean of prglength,
    then the group sizes, the mean gestation in weeks and the difference
    between the two means in days.  Nothing is returned.
    """
    table, firsts, others = MakeTables(data_dir)
    ProcessTables(firsts, others)

    # report the standard deviation and the mean for each group
    import thinkstats as ts
    groups = (('standard:first', firsts), ('standard:others', others))
    for label, group in groups:
        birthlist = [p.prglength for p in group.records]
        mean, variance = ts.MeanVar(birthlist)
        # single pre-formatted strings print identically on Python 2 and 3
        print('%s %s %s' % (label, variance ** 0.5, mean))

    print('Number of first babies %s' % (firsts.n,))
    print('Number of others %s' % (others.n,))

    mu1, mu2 = firsts.mu, others.mu

    print('Mean gestation in weeks:')
    print('First babies %s' % (mu1,))
    print('Others %s' % (mu2,))

    # means are in weeks; multiply by 7 to express the gap in days
    print('Difference in days %s' % ((mu1 - mu2) * 7.0,))
Example #21
0
def main():
    """Run the pregnancy-length hypothesis test on shrinking sub-samples.

    Prints the mean and variance of the pooled lengths, runs the test on
    the full data, then repeats it on samples (drawn without replacement)
    containing 3/4, 1/2 and 1/4 of each group.
    """
    # time.clock() was removed in Python 3.8; calling seed() without an
    # argument keeps the runs non-reproducible by seeding from the system
    random.seed()
    pool, firsts, others = cumulative.MakeTables()
    mean_var = thinkstats.MeanVar(pool.lengths)
    print('(Mean, Var) of pooled data', mean_var)

    print("--------------4/4 ")

    # full NSFG data
    hypothesis.RunTest('length-4-4',
            pool.lengths,
            firsts.lengths,
            others.lengths,
            iters=1000,
            trim=False,
            partition=False)

    # (banner, test name, fraction of each group to keep)
    subsamples = (
        ("--------------3/4 ", 'length-3-4', 0.75),
        ("--------------2/4 ", 'length-half-2-4', 0.5),
        ("--------------1/4 ", 'length-half-1-4', 0.25),
    )
    for banner, name, fraction in subsamples:
        print(banner)
        hypothesis.RunTest(name,
                hypothesis.SampleWithoutReplacement(
                    pool.lengths, int(pool.n * fraction)),
                hypothesis.SampleWithoutReplacement(
                    firsts.lengths, int(firsts.n * fraction)),
                hypothesis.SampleWithoutReplacement(
                    others.lengths, int(others.n * fraction)),
                iters=1000,
                trim=False,
                partition=False)
Example #22
0
def main():
    """Run the pregnancy-length hypothesis test with a fixed random seed.

    Prints the mean and variance of the pooled lengths, then calls RunTest
    on the pooled, first-baby and other-baby lengths.
    """
    # fixed seed so repeated runs give the same resamples
    random.seed(1)

    # load the data
    pool, firsts, others = cumulative.MakeTables()
    print('(Mean, Var) of pooled data', thinkstats.MeanVar(pool.lengths))

    # run the test
    options = dict(iters=1000, trim=False, partition=False)
    RunTest('length', pool.lengths, firsts.lengths, others.lengths, **options)
Example #23
0
def main():
    """Estimate P(Ha|E) from the likelihoods under the null and alternative.

    Runs hypothesis.Test twice -- once with the pooled babies data as both
    models (null) and once with the two groups as their own models
    (alternative) -- and combines the results with a prior of 0.5 using
    Bayes's theorem.  The intermediate values are printed.
    """
    firsts, others, babies = Babies.PartitionBabies()

    # `p` is not defined in this function, so it has to come from the
    # enclosing module: 0 selects weights, anything else pregnancy lengths
    extract = Babies.GetWightList if p == 0 else Babies.GetPregnacyList
    firsts_wtlist = extract(firsts)
    others_wtlist = extract(others)
    babies_wtlist = extract(babies)

    print('(Mean, Var) of babies data', thinkstats.MeanVar(babies_wtlist))

    shared = dict(iters=1000, plot=False)

    # P(E|H0): both groups modelled by the pooled data
    peh0 = hypothesis.Test("peh0", firsts_wtlist, others_wtlist,
                           babies_wtlist, babies_wtlist, **shared)

    # P(E|Ha): each group modelled by its own data
    # NOTE(review): this call reuses the "peh0" label -- possibly a
    # copy-paste slip; confirm before changing
    peha = hypothesis.Test("peh0", firsts_wtlist, others_wtlist,
                           firsts_wtlist, others_wtlist, **shared)

    # P(Ha)
    pha = 0.5

    # P(E), by total probability
    pe = peha * pha + peh0 * (1 - pha)

    # P(Ha|E)
    phae = (peha * pha) / pe
    values = (pha, peh0, peha, phae)
    print("pha = %.3f, peh0 = %.3f, peha = %.3f, phae = %.3f" % values)
Example #24
0
def Pumpkin(data):
    """Print the mean, variance and square root of the variance of data.

    data: sequence of values

    Note that the value labelled "normal variance" in the output is the
    square root of the variance.
    """
    mu, var = thinkstats.MeanVar(data)
    template = "Pumpkin mean:{} variance:{} normal variance:{}"
    print(template.format(mu, var, math.sqrt(var)))
Example #25
0
def standVar(first, others):
    """Return the standard deviations and means of two samples.

    first: sequence of values for the first group
    others: sequence of values for the second group

    Returns: tuple (sd of first, sd of others, mean of first, mean of others)
    """
    import math
    import thinkstats as stats

    first_mean, first_var = stats.MeanVar(first)
    others_mean, others_var = stats.MeanVar(others)
    first_sd = math.sqrt(first_var)
    others_sd = math.sqrt(others_var)
    return first_sd, others_sd, first_mean, others_mean
Example #26
0
def Summarize(srcs):
    """Collect the length of every value in srcs and print mean and stddev.

    srcs: map whose values support len()

    Returns: list of the lengths, in the map's iteration order
    """
    lens = list(map(len, srcs.values()))
    mu, sigma2 = thinkstats.MeanVar(lens)
    sigma = math.sqrt(sigma2)
    print(mu, sigma)
    return lens
Example #27
0
def Process(table):
    """Attach pregnancy-length statistics to table.

    table: object with a `records` sequence whose items have `prglength`

    Sets table.lengths, table.n, table.mu, table.var and table.svar (the
    square root of the variance).
    """
    lengths = [record.prglength for record in table.records]
    table.lengths = lengths
    table.n = len(lengths)
    mu, var = thinkstats.MeanVar(lengths)
    table.mu = mu
    table.var = var
    table.svar = math.sqrt(var)
def Pumpkin():
    """Return (mean, variance, standard deviation) of the pumpkin weights.

    Reads the `weight` attribute of every item in the module-level
    `pumpkins` collection.
    """
    mu, var = thinkstats.MeanVar([item.weight for item in pumpkins])
    return mu, var, math.sqrt(var)
Example #29
0
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import Pmf
import thinkstats

def PmfMean(pmf):
    """Return the sum of value * probability over pmf.Items().

    pmf: object whose Items() yields (value, probability) pairs

    Returns: float mean (0.0 when there are no items)
    """
    # start from 0.0 so the result is a float even for an empty pmf
    return sum((value * prob for value, prob in pmf.Items()), 0.0)

def PmfVar(pmf, mu):
    """Return the probability-weighted squared deviation from mu.

    pmf: object whose Items() yields (value, probability) pairs
    mu: the point the deviations are measured from

    Returns: float variance (0.0 when there are no items)
    """
    weighted = (prob * (value - mu) ** 2 for value, prob in pmf.Items())
    # start from 0.0 so the result is a float even for an empty pmf
    return sum(weighted, 0.0)

if __name__ == "__main__":
    # Compute the mean and variance of one sample in three ways and print
    # each result so they can be compared.
    data = (1, 1, 1, 3, 3, 591)
    report = "Mean:{} Var:{}".format
    divider = "-------------- "

    # 1) the thinkstats helper on the raw data
    mu, var = thinkstats.MeanVar(data)
    print(report(mu, var))
    print(divider)

    # 2) the functions defined in this file, applied to a Pmf
    pmf = Pmf.MakePmfFromList(data, name='test')
    mu = PmfMean(pmf)
    var = PmfVar(pmf, mu)
    print(report(mu, var))
    print(divider)

    # 3) the Pmf object's own methods
    print(report(pmf.Mean(), pmf.Var(mu=None)))
Example #30
0
# Summarize pregnancy lengths of live births, split into first children and
# all other children.  Every print uses a single pre-formatted string so
# the output is identical under Python 2 and Python 3.
print('Number of pregnancies %s' % len(pregnancies.records))
print('')

preg_lengths_first = []
preg_lengths_others = []

for preg in pregnancies.records:
    # only records with outcome == 1 are counted (reported below as live
    # births)
    if preg.outcome != 1:
        continue
    if preg.birthord == 1:
        preg_lengths_first.append(preg.prglength)
    else:
        preg_lengths_others.append(preg.prglength)

pregs_first = len(preg_lengths_first)
mean_length_first, var_length_first = thinkstats.MeanVar(preg_lengths_first)
std_length_first = math.sqrt(var_length_first)

pregs_others = len(preg_lengths_others)
mean_length_others, var_length_others = thinkstats.MeanVar(preg_lengths_others)
std_length_others = math.sqrt(var_length_others)

print('Number of live births, first child %s' % (pregs_first,))
print('Mean pregnancy length (weeks), first child %s' % (mean_length_first,))
print('Variance of gestation time, first child %s' % (var_length_first,))
print('Standard deviation of gestation time, first child %s'
      % (std_length_first,))
print('')
print('Number of live births, other children %s' % (pregs_others,))
print('Mean pregnancy length (weeks), other children %s'
      % (mean_length_others,))
print('Variance of gestation time, other children %s' % (var_length_others,))
print('Standard deviation of gestation time, other children %s'
      % (std_length_others,))