def processScoresTeamwise(pairs):
    """Plot and summarize the average number of goals scored by each team.

    Collects every goal count per team, averages them, shows the CDF of
    those per-team averages, and prints their mean and standard deviation.

    pairs: map from (team1, team2) to an iterable of (score1, score2)
           entries, one entry per game between the two teams
    """
    # map from team to list of goals scored
    goals_scored = {}
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # One average per team.  Iterate over the values: iterating the dict
    # itself yields only the team keys, not (team, goals) pairs.
    lams = [thinkstats.mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes.makeCdfFromList(lams)
    thinkplot.cdf(cdf)
    thinkplot.show()

    mu, var = thinkstats.meanAndVariance(lams)
    print('mu, sig', mu, math.sqrt(var))
def runSimpleProcess(gap_times, lmbda=0.0333, num_passengers=15, plot=True):
    """Run the basic analysis and, optionally, generate its figures.

    As side effects, sets the module-level UPPER_BOUND to 1200 and prints
    the result of credibleInterval(90) on the gap-time CDF scaled by 1/60.

    gap_times: sequence of float
    lmbda: arrival rate in passengers per second
    num_passengers: int number of passengers on the platform
    plot: boolean, whether to generate plots

    Returns: tuple of (WaitTimeCalculator, ElapsedTimeEstimator)
    """
    global UPPER_BOUND
    UPPER_BOUND = 1200

    # NOTE(review): the 1/60 scaling presumably converts seconds to minutes
    # for the printed interval -- confirm against the data source.
    scaled_cdf = thinkbayes.makeCdfFromList(gap_times)
    scaled_cdf = scaled_cdf.scale(1.0 / 60)
    print('CI z', scaled_cdf.credibleInterval(90))

    # Discretize the estimated density of the gap times onto a grid.
    grid = makeRange(low=10)
    gap_pmf = thinkbayes.EstimatedPDF(gap_times).makePmf(grid, name="z")

    calculator = WaitTimeCalculator(gap_pmf, inverse=False)
    if plot:
        calculator.plotPmfs()
        calculator.makePlot()

    estimator = ElapsedTimeEstimator(calculator, lmbda, num_passengers)
    if plot:
        estimator.makePlot()

    return calculator, estimator
def summarize(xs):
    """Print summary statistics for a sequence of values.

    Prints the ten smallest and ten largest values, followed by the 25th,
    50th and 75th percentiles.  The input is left untouched: a sorted copy
    is used, so any iterable of orderable values (including tuples) works.

    xs: sequence of values
    """
    # Work on a sorted copy rather than sorting the caller's list in place.
    ordered = sorted(xs)

    # print smallest and largest
    print('smallest', ordered[:10])
    print('largest', ordered[-10:])

    # print median and interquartile range
    cdf = thinkbayes.makeCdfFromList(ordered)
    print(cdf.percentile(25), cdf.percentile(50), cdf.percentile(75))
def __init__(self, prices, bids, diffs):
    """Build the Player's distributions from historical data.

    Stores an estimated PDF of the prices, a CDF of the diffs, and a
    zero-mean Gaussian error PDF whose spread is the standard deviation
    of the diffs.

    prices: sequence of prices
    bids: sequence of bids (accepted but not used here)
    diffs: sequence of underness (negative means over)
    """
    self.pdf_price = thinkbayes.EstimatedPDF(prices)
    self.cdf_diff = thinkbayes.makeCdfFromList(diffs)

    # The error model is centred on zero with the spread of the diffs.
    spread = numpy.std(diffs)
    self.pdf_error = thinkbayes.GaussianPDF(0, spread)
def medianIPR(xs, p):
    """Compute the median and the interpercentile range of a sample.

    The range spans the central fraction p of the distribution, i.e. it
    is the difference between the values at cumulative probabilities
    1 - (1 - p) / 2 and (1 - p) / 2.

    xs: sequence of values
    p: range (0-1), 0.5 yields the interquartile range

    returns: tuple of float (median, IPR)
    """
    dist = thinkbayes.makeCdfFromList(xs)
    middle = dist.percentile(50)

    # Probability mass left out of the range in each tail.
    tail = (1 - p) / 2
    upper_value = dist.value(1 - tail)
    lower_value = dist.value(tail)

    return middle, upper_value - lower_value
def plotOutliers(samples):
    """Make CDFs showing the distribution of outliers.

    For each labelled sample, keeps only the values below 150, builds a
    CDF from them, and saves all CDFs in one figure with root name
    'variability_cdfs'.

    samples: map from label to sequence of values
    """
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only
    cdfs = []
    for label, sample in samples.items():
        outliers = [x for x in sample if x < 150]
        cdfs.append(thinkbayes.makeCdfFromList(outliers, label))

    thinkplot.clf()
    thinkplot.cdfs(cdfs)
    thinkplot.save(root='variability_cdfs',
                   title='CDF of height',
                   xlabel='Reported height (cm)',
                   ylabel='CDF')
def plotCdfs(d, labels):
    """Plot CDFs for each sequence in a dictionary.

    Jitters the data and subtracts away the mean.  The mean is computed
    from the values before jittering.

    d: map from key to sequence of values
    labels: map from key to string label
    """
    thinkplot.clf()
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only
    for key, xs in d.items():
        mu = thinkstats.mean(xs)
        jittered = thinkstats.jitter(xs, 1.3)
        centered = [x - mu for x in jittered]
        cdf = thinkbayes.makeCdfFromList(centered)
        thinkplot.cdf(cdf, label=labels[key])
    thinkplot.show()
def runLoop(gap_times, nums, lmbda=0.0333):
    """Plot the probability of a long wait for a range of num_passengers.

    As side effects, sets the module-level UPPER_BOUND to 4000, clears the
    current plot, seeds the random generator with 18, and saves a figure
    with root name 'redline5'.  Nothing is returned.

    gap_times: sequence of float
    nums: sequence of values for num_passengers
    lmbda: arrival rate in passengers per second
    """
    global UPPER_BOUND
    UPPER_BOUND = 4000

    thinkplot.clf()
    randomSeed(18)

    # resample gap_times
    sample_size = 220
    resampled = thinkbayes.makeCdfFromList(gap_times).sample(sample_size)
    resampled_pmf = thinkbayes.makePmfFromList(resampled)

    # compute the biased pmf and add some long delays
    biased_cdf = biasPmf(resampled_pmf).makeCdf()
    long_delays = [1800, 2400, 3000]
    biased_sample = biased_cdf.sample(sample_size) + long_delays

    # smooth the distribution of zb
    smoothed_pdf = thinkbayes.EstimatedPDF(biased_sample)
    grid = makeRange(low=60)
    smoothed_pmf = smoothed_pdf.makePmf(grid)

    # unbias the distribution of zb and make wtc
    wtc = WaitTimeCalculator(unbiasPmf(smoothed_pmf))

    # NOTE: this is the "probability of a long wait" part (page 89 per the
    # original author's note).  For each number of passengers on the
    # platform we
    #   * build an ElapsedTimeEstimator,
    #   * extract the distribution of wait time (y),
    #   * compute the probability that the wait exceeds 15 minutes.
    # Author's reading of the resulting plot: with fewer than 20 passengers
    # the system is operating normally, so the probability of a long delay
    # is small; with more than 30 passengers it has been about 15 minutes
    # since the last train, which is longer than a normal delay, so it is
    # time to take a taxi.
    long_wait = 900  # 15 minutes, matching the 'P(y > 15 min)' axis label
    probs = [
        1 - ElapsedTimeEstimator(wtc, lmbda, count).pmf_y.makeCdf().prob(long_wait)
        for count in nums
    ]

    thinkplot.plot(nums, probs)
    thinkplot.save(root='redline5',
                   xlabel='Num passengers',
                   ylabel='P(y > 15 min)',
                   formats=FORMATS)