Example #1
0
def processScoresTeamwise(pairs):
    """Plot and summarize the average number of goals scored per team.

    Gathers every goal count recorded for each team, computes one mean
    per team, plots the CDF of those means, and prints the mean and
    standard deviation of the per-team means.

    pairs: map from (team1, team2) to a sequence of (score1, score2)
           entries, where score1 belongs to team1 and score2 to team2
    """
    # map from team to list of goals scored
    goals_scored = {}
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored, one value per team; iterate
    # the values, since iterating the dict itself only yields team keys
    lams = [thinkstats.mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes.makeCdfFromList(lams)
    thinkplot.cdf(cdf)
    thinkplot.show()

    mu, var = thinkstats.meanAndVariance(lams)
    print('mu, sig', mu, math.sqrt(var))
Example #2
0
def runSimpleProcess(gap_times, lmbda=0.0333, num_passengers=15, plot=True):
    """Run the basic wait-time analysis and optionally draw its figures.

    Sets the module-level UPPER_BOUND to 1200 before building anything.

    gap_times: sequence of float
    lmbda: arrival rate in passengers per second
    num_passengers: int number of passengers on the platform
    plot: boolean, whether to generate plots

    Returns: WaitTimeCalculator, ElapsedTimeEstimator
    """
    global UPPER_BOUND
    UPPER_BOUND = 1200

    # report a 90% credible interval of the gaps after scaling by 1/60
    # (presumably seconds to minutes -- confirm against the data source)
    gap_cdf = thinkbayes.makeCdfFromList(gap_times)
    scaled_cdf = gap_cdf.scale(1.0 / 60)
    print('CI z', scaled_cdf.credibleInterval(90))

    # smooth the observed gaps and discretize them on the evaluation grid
    grid = makeRange(low=10)
    gap_pdf = thinkbayes.EstimatedPDF(gap_times)
    gap_pmf = gap_pdf.makePmf(grid, name="z")

    calculator = WaitTimeCalculator(gap_pmf, inverse=False)
    if plot:
        calculator.plotPmfs()
        calculator.makePlot()

    estimator = ElapsedTimeEstimator(calculator, lmbda, num_passengers)
    if plot:
        estimator.makePlot()

    return calculator, estimator
Example #3
0
def summarize(xs):
    """Prints summary statistics from a sequence of values.

    Prints up to ten of the smallest values, up to ten of the largest
    values, and then the 25th, 50th and 75th percentiles.

    The input is left untouched: a sorted copy is used, so the caller's
    list keeps its order and non-list sequences (e.g. tuples) work too.

    xs: sequence of values
    """
    # work on a sorted copy rather than sorting the caller's data in place
    ordered = sorted(xs)

    # print smallest and largest
    print('smallest', ordered[:10])
    print('largest', ordered[-10:])

    # print median and interquartile range
    cdf = thinkbayes.makeCdfFromList(ordered)
    print(cdf.percentile(25), cdf.percentile(50), cdf.percentile(75))
    def __init__(self, prices, bids, diffs):
        """Build the Player's distributions from past data.

        prices: sequence of prices
        bids: sequence of bids (accepted but not used by this constructor)
        diffs: sequence of underness (negative means over)
        """
        # estimated density of prices and empirical CDF of the differences
        self.pdf_price = thinkbayes.EstimatedPDF(prices)
        self.cdf_diff = thinkbayes.makeCdfFromList(diffs)

        # error model: Gaussian centred on zero whose spread is the
        # standard deviation of the observed differences
        spread = numpy.std(diffs)
        self.pdf_error = thinkbayes.GaussianPDF(0, spread)
Example #5
0
def medianIPR(xs, p):
    """Return the median of xs and the width of its central interval.

    xs: sequence of values
    p: fraction of the distribution (0-1) covered by the interval;
       0.5 yields the interquartile range

    returns: tuple of float (median, IPR)
    """
    dist = thinkbayes.makeCdfFromList(xs)
    middle = dist.percentile(50)

    # probability mass left in each tail outside the central interval
    tail = (1 - p) / 2
    upper = dist.value(1 - tail)
    lower = dist.value(tail)
    return middle, upper - lower
Example #6
0
def plotOutliers(samples):
    """Make CDFs showing the distribution of outliers.

    For each labelled sample, keeps only the values below 150, builds a
    CDF of them, and saves all CDFs in one figure rooted at
    'variability_cdfs'.

    samples: map from label to sequence of values
    """
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    cdfs = [
        thinkbayes.makeCdfFromList([x for x in sample if x < 150], label)
        for label, sample in samples.items()
    ]

    thinkplot.clf()
    thinkplot.cdfs(cdfs)
    thinkplot.save(root='variability_cdfs',
                   title='CDF of height',
                   xlabel='Reported height (cm)',
                   ylabel='CDF')
0
def plotCdfs(d, labels):
    """Plot CDFs for each sequence in a dictionary.

    Jitters the data and subtracts away the mean. The mean is taken
    from the values before jittering.

    d: map from key to sequence of values
    labels: map from key to string label; must contain every key of d
    """
    thinkplot.clf()
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only)
    for key, xs in d.items():
        mu = thinkstats.mean(xs)
        jittered = thinkstats.jitter(xs, 1.3)
        centered = [x - mu for x in jittered]
        cdf = thinkbayes.makeCdfFromList(centered)
        thinkplot.cdf(cdf, label=labels[key])
    thinkplot.show()
Example #8
0
def runLoop(gap_times, nums, lmbda=0.0333):
    """Plot the probability of a long wait for a range of num_passengers.

    Sets the module-level UPPER_BOUND to 4000, clears the current figure
    and seeds the random generator with 18 before resampling. The result
    is saved as a figure rooted at 'redline5'; nothing is returned.

    gap_times: sequence of float
    nums: sequence of values for num_passengers
    lmbda: arrival rate in passengers per second
    """
    global UPPER_BOUND
    UPPER_BOUND = 4000

    thinkplot.clf()

    randomSeed(18)

    # resample the observed gap times
    sample_size = 220
    observed_cdf = thinkbayes.makeCdfFromList(gap_times)
    resampled = observed_cdf.sample(sample_size)
    resampled_pmf = thinkbayes.makePmfFromList(resampled)

    # bias the resampled distribution, draw from it, and append a few
    # long delays
    biased_cdf = biasPmf(resampled_pmf).makeCdf()
    biased_sample = biased_cdf.sample(sample_size) + [1800, 2400, 3000]

    # smooth the biased sample and discretize it on the evaluation grid
    smoothed_pdf = thinkbayes.EstimatedPDF(biased_sample)
    grid = makeRange(low=60)
    biased_pmf = smoothed_pdf.makePmf(grid)

    # undo the bias and build the wait-time calculator
    calculator = WaitTimeCalculator(unbiasPmf(biased_pmf))

    def prob_long_wait(num_passengers):
        """Posterior probability of waiting more than 15 minutes.

        Builds an ElapsedTimeEstimator for the given passenger count,
        takes its distribution of wait time (pmf_y), and returns the
        probability that the wait exceeds 900 (15 minutes in seconds).
        """
        estimator = ElapsedTimeEstimator(calculator, lmbda, num_passengers)
        wait_cdf = estimator.pmf_y.makeCdf()
        return 1 - wait_cdf.prob(900)

    # one probability per candidate number of passengers on the platform
    probs = [prob_long_wait(count) for count in nums]

    thinkplot.plot(nums, probs)
    thinkplot.save(
        root='redline5',
        xlabel='Num passengers',
        ylabel='P(y > 15 min)',
        formats=FORMATS,
    )