Esempio n. 1
0
def SummarizeWeight(rows, input_limit=None):
    years = [
        1981, 1982, 1985, 1986, 1988, 1989, 1990, 1992, 1993, 1994, 1996, 1998,
        2000, 2002, 2004, 2006, 2008
    ]

    all_diffs = []

    for i, row in enumerate(rows):
        if i == input_limit:
            break

        id, race, sex = row[:3]
        weights = row[3:]
        print id
        diffs = Differences(years, weights, jitter=3)
        all_diffs.extend(diffs)

    weights, changes = zip(*all_diffs)

    print 'Mean weight', thinkstats.Mean(weights)
    print 'Mean change', thinkstats.Mean(changes)
    print numpy.corrcoef(weights, changes)

    pyplot.hexbin(weights, changes, cmap=matplotlib.cm.gray_r)
    myplot.Save(
        'nlsy_scatter',
        title='Weight change vs. weight',
        xlabel='Current weight (pounds)',
        ylabel='Weight change (pounds)',
        axis=[70, 270, -25, 25],
        legend=False,
        show=True,
    )
Esempio n. 2
0
def main():
    """Calls process() 1000 times and prints the mean of each of its two results.

    process() is expected to return a pair (unchange, change); the two
    components are averaged separately.
    """
    trials = [process() for _ in range(1000)]
    unchange_rate = [unchanged for unchanged, _ in trials]
    change_rate = [changed for _, changed in trials]

    summary = "Mean(Unchange) = %.3f \t Mean(Change) = %.3f" % (
        thinkstats.Mean(unchange_rate), thinkstats.Mean(change_rate))
    print(summary)
Esempio n. 3
0
def Process(table):
    """Runs analysis on the given table.

    Nothing is returned; the results are stored on the table as the
    attributes lengths, n, mu and std.

    Args:
        table: table object whose records each have a prglength attribute
    """
    table.lengths = [p.prglength for p in table.records]
    table.n = len(table.lengths)
    table.mu = thinkstats.Mean(table.lengths)
    # the standard deviation is the square root of the variance,
    # not the square root of the mean
    table.std = math.sqrt(thinkstats.Var(table.lengths, table.mu))
Esempio n. 4
0
def Cov(xs, ys):
    """Covariance of two sequences.

    The resulting value is awkward to read and its unit has no useful
    meaning.  Returns 0 when the two sequences differ in length.
    """
    count = len(xs)
    if len(ys) != count:
        return 0

    mean_x = thinkstats.Mean(xs)
    mean_y = thinkstats.Mean(ys)
    # sum of the products of paired deviations, divided by the sample size
    products = ((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
    return sum(products) / count
Esempio n. 5
0
def DifferenceInMean(actual1, actual2):
    """Computes the means of two groups and how far apart they are.

    Args:
        actual1: sequence of float
        actual2: sequence of float

    Returns:
        tuple of (mu1, mu2, mu1-mu2)
    """
    means = (thinkstats.Mean(actual1), thinkstats.Mean(actual2))
    return means + (means[0] - means[1],)
Esempio n. 6
0
def ex1_3_4():
    """Averages prglength for first births versus all other births.

    Only records in TABLE with outcome == 1 are considered; they are split
    by whether birthord equals 1.

    Returns:
        dict with keys 'avg_first' and 'avg_other'
    """
    first_lengths, other_lengths = [], []

    for record in TABLE.records:
        if record.outcome != 1:
            continue
        bucket = first_lengths if record.birthord == 1 else other_lengths
        bucket.append(record.prglength)

    averages = {}
    averages['avg_first'] = thinkstats.Mean(first_lengths)
    averages['avg_other'] = thinkstats.Mean(other_lengths)
    return averages
Esempio n. 7
0
def main():
    """Simulates series of 1 and of 82 matches and prints the mean result.

    Each series length is simulated 1000 times through matches(); the mean
    of the results is printed as a percentage.
    """
    simulations = 1000  # number of simulation runs
    for games in (1, 82):  # two cases: a single match and 82 matches
        outcomes = [matches(games, 10, 15, 0.5) for _ in range(simulations)]
        percentage = thinkstats.Mean(outcomes) * 100
        print("matches %2d: prob = %.3f%% " % (games, percentage))
Esempio n. 8
0
def ProcessScoresPairwise(pairs):
    """Average number of goals for each team against each opponent.

    pairs: map from (team1, team2) to (score1, score2)
    """
    # map from (team1, team2) to list of goals scored
    goals_scored = {}
    for key, entries in pairs.iteritems():
        t1, t2 = key
        for entry in entries:
            g1, g2 = entry
            goals_scored.setdefault((t1, t2), []).append(g1)
            goals_scored.setdefault((t2, t1), []).append(g2)

    # make a list of average goals scored
    lams = []
    for key, goals in goals_scored.iteritems():
        if len(goals) < 3:
            continue
        lam = thinkstats.Mean(goals)
        lams.append(lam)

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print 'mu, sig', mu, math.sqrt(var)

    print 'BOS v VAN', pairs['BOS', 'VAN']
Esempio n. 9
0
def DifferenceInMeans(firsts, others, attr):
    """Compute the difference in means between tables for a given attr.

    Prints summary statistics.
    """
    firsts_mean = thinkstats.Mean(getattr(firsts, attr))
    print 'First babies, %s, trimmed mean:' % attr, firsts_mean

    others_mean = thinkstats.Mean(getattr(others, attr))
    print 'Other babies, %s, trimmed mean:' % attr, others_mean

    diff = others_mean - firsts_mean
    print 'Difference in means:', diff
    print

    return diff
Esempio n. 10
0
def RunFit(xs, ys):
    inter, slope, R2 = Fit(xs, ys)
    fxs = [min(xs), max(xs)]
    fys = [inter + slope * x for x in fxs]
    pyplot.plot(fxs, fys, 'r-', linewidth=2)
    print 'Mean diff', thinkstats.Mean(ys)
    print 'Current rate:', fys[-1]
Esempio n. 11
0
def ProcessScoresTeamwise(pairs):
    """Average number of goals for each team.

    Plots the CDF of the per-team averages and prints their mean and
    standard deviation.

    pairs: map from (team1, team2) to list of (score1, score2)
    """
    # map from team to list of goals scored
    goals_scored = {}
    # items()/values() exist on both Python 2 and 3, unlike iteritems()
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored
    lams = [thinkstats.Mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkstats.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
Esempio n. 12
0
def p_value(delta, pcs, num_questions, iterations=10000):
    """Estimates by simulation how often the mean simulated difference
    in exam scores reaches delta.

    Each iteration draws fake_diffs(pcs, num_questions) and counts a hit
    when its mean is >= delta.

    Returns:
        fraction of iterations that were hits (float)
    """
    # float start value keeps the final division a true division
    hits = sum(
        (1.0 for _ in range(iterations)
         if thinkstats.Mean(fake_diffs(pcs, num_questions)) >= delta),
        0.0)
    return hits / iterations
Esempio n. 13
0
def main2():
    """Simulates series of 1 and of 82 matches and prints the mean result.

    For each series length, match(criteria, length) is evaluated 1000
    times and the mean of the results is printed as a percentage.
    """
    simulations = 1000
    criteria = 10
    for games in (1, 82):
        outcomes = [match(criteria, games) for _ in range(simulations)]
        percentage = thinkstats.Mean(outcomes) * 100
        print('matches %2d: prob = %.3f%%' % (games, percentage))
Esempio n. 14
0
 def testVar(self):
     """Checks Mean and Var (with and without a supplied mean) on a sample."""
     sample = [1, 1, 1, 3, 3, 591]
     expected_mean, expected_var = 100.0, 48217.0

     mean = thinkstats.Mean(sample)
     implicit_var = thinkstats.Var(sample)
     explicit_var = thinkstats.Var(sample, mean)

     self.assertAlmostEqual(mean, expected_mean)
     self.assertAlmostEqual(implicit_var, expected_var)
     self.assertAlmostEqual(explicit_var, expected_var)
Esempio n. 15
0
    def SummarizeWeightChange(self):
        """Print the mean reported change in weight in kg."""

        data = [(r.weight2, r.wtyrago) for r in self.records
                if r.weight2 != 'NA' and r.wtyrago != 'NA']

        changes = [(curr - prev) for curr, prev in data]

        print 'Mean change', thinkstats.Mean(changes)
Esempio n. 16
0
def MakeFigure():
    fp = open('babyboom.dat')
    
    # skip to the beginning of the data
    for line in fp:
        if line.find('START DATA') != -1:
            break
    
    # read a list of times
    times = []
    for line in fp:
        t = line.split()
        time = int(t[-1])
        times.append(time)
    
    # compute interarrival times
    diffs = [times[0]]
    for i in range(len(times)-1):
        diff = times[i+1] - times[i]
        diffs.append(diff)
    
    n = len(diffs)
    mu = thinkstats.Mean(diffs)
        
    print 'mean interarrival time', mu
    
    cdf = Cdf.MakeCdfFromList(diffs, 'actual')

    sample = [random.expovariate(1/mu) for i in range(n)]
    model = Cdf.MakeCdfFromList(sample, 'model')
    
    myplot.Cdf(cdf)
    myplot.Save(root='interarrivals',
              title='Time between births',
              xlabel='minutes',
              ylabel='CDF',
              legend=False,
                formats=['eps', 'png', 'pdf'])

    myplot.Cdfs([cdf, model], complement=True)
    myplot.Save(root='interarrivals_model',
                title='Time between births',
                xlabel='minutes',
                ylabel='Complementary CDF',
                yscale='log',
                formats=['eps', 'png', 'pdf'])

    pyplot.subplots_adjust(bottom=0.11)
    myplot.Cdf(cdf, complement=True)
    myplot.Save(root='interarrivals_logy',
                title='Time between births',
                xlabel='minutes',
                ylabel='Complementary CDF',
                yscale='log',
                legend=False,
                formats=['eps', 'png', 'pdf'])
Esempio n. 17
0
def main():
    """Compares two estimates of lam computed from repeated samples.

    For each of 1000 samples drawn with samples('expo', n=10, lam=0.5),
    one estimate is 1 / sample mean and the other is log(2) / sample
    median.  Prints the mean of each series of estimates and the value
    CalculateMSE reports for each against the true lam.
    """
    lam = 0.5
    n = 10
    mean_based = []
    median_based = []
    for _ in range(1000):
        data = samples('expo', n=n, lam=lam)
        sample_mean = thinkstats.Mean(data)
        sample_median = thinkstats.Median(data)
        mean_based.append(1 / sample_mean)
        median_based.append(math.log(2) / sample_median)

    averages = (thinkstats.Mean(mean_based), thinkstats.Mean(median_based))
    print("Mean(lams1) = %.3f, Mean(lams2) = %.3f" % averages)

    errors = (CalculateMSE(mean_based, lam), CalculateMSE(median_based, lam))
    print("MSE(lams1) = %.3f, MSE(lams2) = %.3f" % errors)
Esempio n. 18
0
def pumpkin(weights):
    """
    Summarize an iterable of pumpkin weights.

    Returns a tuple (mean, variance, standard deviation); the standard
    deviation is obtained from std_dev(weights, mean, variance).
    """
    mu = thinkstats.Mean(weights)
    var = thinkstats.Var(weights, mu)
    return mu, var, std_dev(weights, mu, var)
Esempio n. 19
0
def Process(table):
    """Runs analysis on the given table.

    The function returns nothing (None); its results are stored on the
    table as the attributes lengths, n and mu so they can be read later.

    Args:
        table: table object whose records each have a prglength attribute
    """
    lengths = [record.prglength for record in table.records]
    table.lengths = lengths
    table.n = len(lengths)
    table.mu = thinkstats.Mean(lengths)
Esempio n. 20
0
def MakeLinePlot(age_bins):
    """Plots the mean of each bin's weights against the bin key.

    Bins are plotted in sorted key order and the figure is saved under
    the root 'agemodel_line'.

    age_bins: map from bin key to sequence of weights
    """
    ordered = sorted(age_bins.iteritems())
    xs = [age for age, _ in ordered]
    ys = [thinkstats.Mean(group) for _, group in ordered]

    myplot.Plot(xs, ys, 'bs-')
    myplot.Save(
        root='agemodel_line',
        xlabel="Mother's age (years)",
        ylabel='Mean birthweight (oz)',
        legend=False,
    )
Esempio n. 21
0
def EstimateRankits(n=6, m=1000):
    """Estimates the expected values of sorted random samples.

    The result of Samples(n, m) is transposed and each resulting column
    is averaged.

    n: sample size
    m: number of iterations

    Returns: list of n rankits
    """
    columns = zip(*Samples(n, m))
    return [thinkstats.Mean(column) for column in columns]
Esempio n. 22
0
def Cov(xs, ys, mux=None, muy=None):
    """Computes Cov(X, Y).

    The paired deviation products are summed over zip(xs, ys) and the
    sum is divided by len(xs).

    Args:
        xs: sequence of values
        ys: sequence of values
        mux: optional float mean of xs
        muy: optional float mean of ys

    Returns:
        Cov(X, Y)
    """
    # fall back to the sample means when they are not supplied
    mux = thinkstats.Mean(xs) if mux is None else mux
    muy = thinkstats.Mean(ys) if muy is None else muy

    accumulated = 0.0
    for x, y in zip(xs, ys):
        dx = x - mux
        dy = y - muy
        accumulated += dx * dy

    return accumulated / len(xs)
Esempio n. 23
0
    def testMeanAndVar(self):
        """Checks that a Pmf reports the same mean and variance as thinkstats.

        The Pmf variance is checked both with and without a supplied mean.
        """
        t = [1, 2, 2, 3, 5]
        mu = thinkstats.Mean(t)
        var = thinkstats.Var(t, mu)

        pmf = Pmf.MakePmfFromList(t)
        mu2 = pmf.Mean()
        var2 = pmf.Var()
        var3 = pmf.Var(mu2)

        # assertAlmostEquals is a deprecated alias that Python 3.12 removed
        self.assertAlmostEqual(mu, mu2)
        self.assertAlmostEqual(var, var2)
        self.assertAlmostEqual(var, var3)
Esempio n. 24
0
def Skewness(d):
    """Computes the skewness of d as m3 / m2**1.5.

    m2 and m3 are the mean squared and mean cubed deviations from the
    mean of d.

    Args:
        d: sequence of numbers

    Returns:
        float skewness

    Raises:
        ZeroDivisionError: if every value equals the mean (m2 is zero)
    """
    # float count so the moment divisions never truncate under Python 2
    n = float(len(d))
    mean = thinkstats.Mean(d)

    m2 = 0
    m3 = 0
    for x in d:
        t = x - mean
        m2 += t**2
        m3 += t**3
    m2 /= n
    m3 /= n

    # literal 1.5: under Python 2 integer division, 3 / 2 evaluates to 1
    return m3 / (m2**1.5)
Esempio n. 25
0
def main():
    """Finds the first n in 1..19 whose mean daily maximum reaches 1000.

    For every n, one sample is drawn per day of a 365-day year with
    normal_sample(n, 950, 50) and the largest value of each sample
    (truncated to int) is collected.  As soon as the mean of those
    maxima is at least 1000, the maxima are handed to process() and the
    search stops.
    """
    days_per_year = 365
    for n in range(1, 20):
        # for each day of the year keep the largest value of a sample of size n
        daily_maxima = [int(max(normal_sample(n, 950, 50)))
                        for _ in range(days_per_year)]

        # once the mean maximum reaches 1000, stop the experiment;
        # process() presumably plots the CDF -- confirm against its definition
        if thinkstats.Mean(daily_maxima) >= 1000:
            process(daily_maxima)
            break
Esempio n. 26
0
    def testVar(self):
        t = [1, 1, 1, 3, 3, 591]
        mu = thinkstats.Mean(t)
        var1 = thinkstats.Var(t)
        var2 = thinkstats.Var(t, mu)
        
        print
        print 'Pumpkins'
        print 'mean', mu 
        print 'var', var1
        print 'var', var2

        self.assertAlmostEquals(mu, 100.0)
        self.assertAlmostEquals(var1, 48217.0)
        self.assertAlmostEquals(var2, 48217.0)
Esempio n. 27
0
def main():
    suite = MakeUniformSuite(0.001, 1.5, 1000)
    evidence = [1.5, 2, 3, 4, 5, 12]

    Update(suite, evidence)
    suite.name = 'posterior'

    # plot the posterior distributions
    myplot.Pmf(suite)
    myplot.Show(title='Decay parameter',
                xlabel='Parameter (inverse cm)',
                ylabel='Posterior probability')

    print 'Naive parameter estimate:', 1.0 / thinkstats.Mean(evidence)
    print 'Mean of the posterior distribution:', suite.Mean()
Esempio n. 28
0
def PlotCdfs(d, labels):
    """Plot CDFs for each sequence in a dictionary.

    Each sequence is jittered and then shifted by the mean of the
    original (un-jittered) values before its CDF is drawn.

    d: map from key to sequence of values
    labels: map from key to string label
    """
    thinkplot.Clf()
    for key, values in d.iteritems():
        # the mean is taken before jittering
        center = thinkstats.Mean(values)
        jittered = thinkstats.Jitter(values, 1.3)
        centered = [x - center for x in jittered]
        thinkplot.Cdf(thinkbayes.MakeCdfFromList(centered), label=labels[key])
    thinkplot.Show()
Esempio n. 29
0
def Partition(ages, weights, bin_size=5):
    weight_dict = {}
    for age, weight in zip(ages, weights):
        bin = math.floor(age / bin_size)
        weight_dict.setdefault(bin, []).append(weight)

    print 'Bin', 'Mean weight (oz)'
    for bin, bin_weights in weight_dict.iteritems():
        age = bin * bin_size
        try:
            mean = thinkstats.Mean(bin_weights)
            print age, mean
        except ZeroDivisionError:
            continue

    print
    return weight_dict
Esempio n. 30
0
def Partition(ages, weights, bin_size=2):
    """Break ages into bins.

    Each age is mapped to the midpoint of its bin:
    bin_size * floor(age / bin_size) + bin_size / 2.0.

    Args:
        ages: sequence of ages
        weights: sequence of weights, paired with ages by position
        bin_size: width of each age bin

    Returns:
        map from bin midpoint to list of weights
    """
    weight_dict = {}
    for age, weight in zip(ages, weights):
        midpoint = bin_size * math.floor(age / bin_size) + bin_size / 2.0
        weight_dict.setdefault(midpoint, []).append(weight)

    # The former second pass computed each bin's mean and threw it away;
    # it had no effect on the result, so it has been removed.
    return weight_dict