def ProcessScoresTeamwise(pairs):
    """Plot and print the distribution of average goals scored per team.

    pairs: map from (team1, team2) to a sequence of (score1, score2)
           tuples, one entry per game between the two teams

    Shows the CDF of the per-team averages with thinkplot, then prints
    the mean of the averages and the square root of their variance.
    """
    # map from team to list of goals scored
    goals_scored = {}

    # items() rather than iteritems(): the latter does not exist on
    # Python 3 dicts, while items() works on both Python 2 and 3
    for (t1, t2), entries in pairs.items():
        for g1, g2 in entries:
            goals_scored.setdefault(t1, []).append(g1)
            goals_scored.setdefault(t2, []).append(g2)

    # make a list of average goals scored (the team names are not needed)
    lams = [thinkbayes2.Mean(goals) for goals in goals_scored.values()]

    # make the distribution of average goals scored
    cdf = thinkbayes2.MakeCdfFromList(lams)
    thinkplot.Cdf(cdf)
    thinkplot.Show()

    mu, var = thinkbayes2.MeanVar(lams)
    print('mu, sig', mu, math.sqrt(var))
def __init__(self, prices, bids, diffs):
    """Build the distributions that describe a Player.

    prices: sequence of prices
    bids: sequence of bids (accepted but not used in this method)
    diffs: sequence of underness (negative means over)
    """
    # distribution of prices, built by thinkbayes2.EstimatedPdf
    self.pdf_price = thinkbayes2.EstimatedPdf(prices)

    # empirical CDF of the observed differences
    self.cdf_diff = thinkbayes2.MakeCdfFromList(diffs)

    # error model: normal PDF with mean 0 and sigma equal to the
    # standard deviation of diffs
    self.pdf_error = thinkbayes2.NormalPdf(0, numpy.std(diffs))
def Summarize(xs):
    """Prints summary statistics from a sequence of values.

    Prints the ten smallest and ten largest values, followed by the
    25th, 50th and 75th percentiles.

    xs: sequence of values (left unmodified)
    """
    # sort a copy: an in-place xs.sort() would silently reorder the
    # caller's data and would fail for sequences without .sort()
    # (e.g. tuples)
    ordered = sorted(xs)

    # print smallest and largest
    print('smallest', ordered[:10])
    print('largest', ordered[-10:])

    # print median and interquartile range
    cdf = thinkbayes2.MakeCdfFromList(ordered)
    print(cdf.Percentile(25), cdf.Percentile(50), cdf.Percentile(75))
def GenerateCdf(n=1000, pc=0.35, lam1=0.79, lam2=5.0):
    """Generates a sample of RDTs and returns its CDF.

    n: sample size
    pc: probability of negative growth
    lam1: exponential parameter of positive growth
    lam2: exponential parameter of negative growth

    Returns: Cdf of generated sample
    """
    # draw the sample and wrap it in a Cdf in one step
    return thinkbayes2.MakeCdfFromList(GenerateSample(n, pc, lam1, lam2))
def MedianIPR(xs, p):
    """Computes the median and interpercentile range.

    xs: sequence of values
    p: range (0-1), 0.5 yields the interquartile range

    returns: tuple of float (median, IPR)
    """
    dist = thinkbayes2.MakeCdfFromList(xs)
    middle = dist.Percentile(50)

    # probability mass left outside the range on each side
    tail = (1 - p) / 2

    upper = dist.Value(1 - tail)
    lower = dist.Value(tail)
    return middle, upper - lower
def PlotOutliers(samples):
    """Make CDFs showing the distribution of outliers.

    samples: map from label to sequence of values

    For each sample, only the values below 150 are kept; one CDF per
    label is plotted and the figure is saved under the root name
    'variability_cdfs'.
    """
    cutoff = 150

    # one labeled CDF per sample, restricted to values under the cutoff
    cdfs = [
        thinkbayes2.MakeCdfFromList(
            [value for value in sample if value < cutoff], label)
        for label, sample in samples.items()
    ]

    thinkplot.Clf()
    thinkplot.Cdfs(cdfs)
    thinkplot.Save(
        root='variability_cdfs',
        title='CDF of height',
        xlabel='Reported height (cm)',
        ylabel='CDF',
    )
def PlotCdfs(d, labels):
    """Plot CDFs for each sequence in a dictionary.

    Each sequence is jittered and then shifted by the mean of the
    original (un-jittered) values before its CDF is drawn.

    d: map from key to sequence of values
    labels: map from key to string label
    """
    thinkplot.Clf()

    for key, values in d.items():
        # the mean is taken before jittering
        center = thinkbayes2.Mean(values)
        jittered = thinkbayes2.Jitter(values, 1.3)
        shifted = [value - center for value in jittered]

        thinkplot.Cdf(thinkbayes2.MakeCdfFromList(shifted),
                      label=labels[key])

    thinkplot.Show()
def TestCorrelation(cdf):
    """Tests the correlated generator.

    Draws 10000 values from CorrelatedGenerator(cdf, 0.4), prints the
    target correlation next to the serial correlation of the draws, and
    plots the input CDF against the CDF of the draws so the two
    distributions can be compared.

    cdf: Cdf passed to CorrelatedGenerator and plotted for comparison
    """
    n = 10000
    rho = 0.4

    rdt_seq = CorrelatedGenerator(cdf, rho)

    # built-in next() instead of rdt_seq.next(): the .next() method does
    # not exist on Python 3 generators, while next() works on Python 2.6+
    # and 3.  NOTE(review): assumes CorrelatedGenerator returns a
    # standard iterator/generator -- confirm.
    xs = [next(rdt_seq) for _ in range(n)]

    rho2 = correlation.SerialCorr(xs)
    print((rho, rho2))

    cdf2 = thinkbayes2.MakeCdfFromList(xs)
    thinkplot.Cdfs([cdf, cdf2])
    thinkplot.Show()