def testKaplanMeier(self):
    """Check Pmf arithmetic and the steps of a Kaplan-Meier style hazard.

    Two un-normalized Pmfs (completed and ongoing cases) are combined
    with +, -, * and /, then used with their survival functions to build
    an at-risk count and a hazard.
    """
    complete = [1, 3, 6]
    ongoing = [2, 3, 5, 7]

    pmf_complete = Pmf.from_seq(complete, normalize=False)
    pmf_ongoing = Pmf.from_seq(ongoing, normalize=False)

    # Binary operators between the two Pmfs; the expected lists have one
    # entry per quantity appearing in either input.
    total = pmf_complete + pmf_ongoing
    self.assertListEqual(list(total), [1, 1, 2, 1, 1, 1])

    difference = pmf_complete - pmf_ongoing
    self.assertListEqual(
        list(difference), [1.0, -1.0, 0.0, -1.0, 1.0, -1.0]
    )

    product = pmf_complete * pmf_ongoing
    self.assertListEqual(list(product), [0.0, 0.0, 1.0, 0.0, 0.0, 0.0])

    ratio = pmf_complete / pmf_ongoing
    self.assertListEqual(
        list(ratio), [np.inf, 0.0, 1.0, 0.0, np.inf, 0.0]
    )

    surv_complete = pmf_complete.make_surv()
    surv_ongoing = pmf_ongoing.make_surv()

    # Evaluate both survival functions at every quantity where a case
    # either completed or is still ongoing.
    done = pmf_complete + pmf_ongoing

    s1 = surv_complete(done.index)
    self.assertListEqual(list(s1), [2., 2., 1., 1., 0., 0.])

    s2 = surv_ongoing(done.index)
    self.assertListEqual(list(s2), [4., 3., 2., 1., 1., 0.])

    at_risk = done + s1 + s2
    self.assertListEqual(list(at_risk), [7.0, 6.0, 5.0, 3.0, 2.0, 1.0])

    # Hazard: completed cases divided by the number at risk.
    haz = pmf_complete / at_risk
    self.assertListEqual(
        list(haz), [0.14285714285714285, 0.0, 0.2, 0.0, 0.5, 0.0]
    )
def testComparison(self):
    """Check the comparison methods against a scalar and another Pmf.

    Both naming families (`<op>_dist` and `prob_<op>`) are expected to
    give the same results for eq, ne, gt, ge, lt and le.
    """
    pmf1 = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    pmf2 = Pmf.from_seq([1, 2, 3, 4])

    ops = ("eq", "ne", "gt", "ge", "lt", "le")
    # Expected values, in the same order as `ops`.
    versus_scalar = (1 / 6, 5 / 6, 3 / 6, 4 / 6, 2 / 6, 3 / 6)
    versus_pmf = (1 / 6, 5 / 6, 0.5833333, 3 / 4, 1 / 4, 0.41666666)

    # Order of checks: all `<op>_dist` against 3, then against pmf2,
    # followed by all `prob_<op>` against 3, then against pmf2.
    for template in ("{}_dist", "prob_{}"):
        for other, expected_values in ((3, versus_scalar), (pmf2, versus_pmf)):
            for op, expected in zip(ops, expected_values):
                compare = getattr(pmf1, template.format(op))
                self.assertAlmostEqual(compare(other), expected)
def testMinMax(self):
    """Check max_dist(2) and min_dist(2) on a uniform Pmf over 1, 2, 3."""
    base = Pmf.from_seq([1, 2, 3])

    largest = base.max_dist(2)
    expected_max = Pmf([1 / 9, 3 / 9, 5 / 9], base.index)
    self.almost_equal_dist(largest, expected_max)

    smallest = base.min_dist(2)
    expected_min = Pmf([5 / 9, 3 / 9, 1 / 9], base.index)
    self.almost_equal_dist(smallest, expected_min)
def distribution_of_income(brfss):
    """Show a bar chart of the PMF of the "INCOME2" column.

    brfss: object indexable by column name (presumably a DataFrame of
           BRFSS survey data; confirm against the caller)
    """
    # Build the PMF from the income column and draw it as bars.
    income_pmf = Pmf.from_seq(brfss["INCOME2"])
    income_pmf.bar()

    # Axis labels, then display the figure.
    plt.xlabel('Income level')
    plt.ylabel('PMF')
    plt.show()
def testAdd(self):
    """Check that `+` and `add` agree on the mean of the normalized result."""
    six_sided = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    four_sided = Pmf.from_seq([1, 2, 3, 4])

    # Operator form.
    via_operator = six_sided + four_sided
    via_operator.normalize()
    self.assertAlmostEqual(via_operator.mean(), 3)

    # Method form.
    via_method = six_sided.add(four_sided)
    via_method.normalize()
    self.assertAlmostEqual(via_method.mean(), 3)
def pmf_from_dist(dist, qs):
    """Approximate a distribution with a Pmf over the given quantities.

    dist: SciPy distribution object (anything exposing `pdf`)
    qs: quantities at which the density is evaluated

    returns: normalized Pmf
    """
    densities = dist.pdf(qs)
    approximation = Pmf(densities, qs)
    # Densities do not sum to 1 on a grid, so rescale.
    approximation.normalize()
    return approximation
def make_poisson_pmf(lam, qs):
    """Build a Pmf for a Poisson distribution restricted to `qs`.

    lam: event rate
    qs: sequence of values for `k`

    returns: Pmf, normalized over `qs`
    """
    frozen = poisson(lam)
    result = Pmf(frozen.pmf(qs), qs)
    # Renormalize because `qs` may not cover the whole support.
    result.normalize()
    return result
def kde_from_sample(sample, qs):
    """Estimate a density from a sample and return it as a Pmf.

    sample: sequence of values
    qs: quantities where the KDE is evaluated

    returns: normalized Pmf
    """
    estimator = gaussian_kde(sample)
    densities = estimator(qs)
    result = Pmf(densities, qs)
    result.normalize()
    return result
def plot_pmf(T, S):
    """Plot the PMFs of two sequences side by side and save the figure.

    The left panel is labeled "Avalanche duration" and the right panel
    "Avalanche size". The figure is displayed and then written to
    "pmf_plot.png" in the current working directory.

    T: sequence of values for the left panel
    S: sequence of values for the right panel
    """
    pmfT = Pmf.from_seq(T)
    pmfS = Pmf.from_seq(S)

    fig = plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    pmfT.plot(xlim=(0, 50), xlabel="Avalanche duration", ylabel="PMF")

    plt.subplot(1, 2, 2)
    pmfS.plot(xlim=(0, 50), xlabel="Avalanche size", ylabel="PMF")

    # The original passed the string 'PMF size and duration' positionally
    # to plt.show(). show() only takes the keyword argument `block`, so
    # the string was never a title and raises TypeError on current
    # Matplotlib. Call it without arguments instead.
    plt.show()

    # Save through the Figure object so the right figure is written even
    # after show() has returned.
    fig.savefig("pmf_plot.png")
def make_uniform(qs, name=None, **options):
    """Build a Pmf giving equal probability to every quantity in `qs`.

    qs: quantities
    name: string name for the quantities (applied to the index if truthy)
    options: passed to Pmf

    returns: Pmf
    """
    uniform = Pmf(1.0, qs, **options)
    uniform.normalize()

    if not name:
        return uniform

    uniform.index.name = name
    return uniform
def kde_from_pmf(pmf, n=101):
    """Smooth a Pmf with a weighted kernel density estimate.

    pmf: Pmf object
    n: number of equally spaced points between the smallest and largest
       quantity of `pmf`

    returns: Pmf object, normalized
    """
    # Quantities are the sample points and probabilities their weights.
    estimator = gaussian_kde(pmf.qs, weights=pmf.ps)

    grid = np.linspace(pmf.qs.min(), pmf.qs.max(), n)
    smoothed = Pmf(estimator.evaluate(grid), grid)
    smoothed.normalize()
    return smoothed
def pmf_from_dist(dist, low, high, n=101):
    """Make a discrete approximation of a continuous distribution.

    The density is evaluated on `n` equally spaced points from `low` to
    `high` (both included) and the result is normalized.

    dist: any SciPy distribution object (anything exposing `pdf`)
    low: low end of range
    high: high end of range
    n: number of grid points; defaults to 101, the value that was
       previously hard-coded

    returns: normalized Pmf
    """
    qs = np.linspace(low, high, n)
    ps = dist.pdf(qs)
    pmf = Pmf(ps, qs)
    pmf.normalize()
    return pmf
def testSort(self):
    """Check quantity ordering after sort_index and in the derived Cdf."""
    letters = list('allen')
    pmf = Pmf.from_seq(letters, sort=False)
    pmf.sort_index(inplace=True)

    # After sorting in place, quantities run from 'a' to 'n'.
    self.assertEqual(pmf.qs[0], 'a')
    self.assertEqual(pmf.qs[-1], 'n')

    # The Cdf made from the sorted Pmf keeps that order.
    cdf = pmf.make_cdf()
    self.assertEqual(cdf.qs[0], 'a')
    self.assertEqual(cdf.qs[-1], 'n')

    # currently Pmf.from_seq sorts numerical sort_values
    # regardless of the sort keyword
    numeric = Pmf.from_seq([3, 6, 1, 7, 2], sort=False)
    self.assertEqual(numeric.qs[0], 1)
def test_joint(self):
    """Check marginals and conditionals of a joint made from two Pmfs.

    The means of both the marginal and the conditional distributions are
    expected to match the means of the Pmfs the joint was built from.
    """
    first = Pmf.from_seq([1, 2, 2])
    second = Pmf.from_seq([1, 2, 3])

    joint = Pmf.make_joint(first, second)

    # Marginals along each level.
    marginal_first = joint.marginal(0)
    marginal_second = joint.marginal(1)
    self.assertAlmostEqual(marginal_first.mean(), first.mean())
    self.assertAlmostEqual(marginal_second.mean(), second.mean())

    # Conditionals.
    conditional_first = joint.conditional(0, 1, 1)
    conditional_second = joint.conditional(1, 0, 1)
    self.assertAlmostEqual(conditional_first.mean(), first.mean())
    self.assertAlmostEqual(conditional_second.mean(), second.mean())
def kde_from_pmf(pmf, n=101, **options):
    """Smooth a Pmf with a weighted kernel density estimate.

    pmf: Pmf object
    n: number of equally spaced points between the smallest and largest
       quantity of `pmf`
    options: passed to the Pmf constructor for the result

    returns: Pmf object, normalized
    """
    # TODO: should this take qs rather than use min-max?
    estimator = gaussian_kde(pmf.qs, weights=pmf.ps)

    grid = np.linspace(pmf.qs.min(), pmf.qs.max(), n)
    densities = estimator.evaluate(grid)

    smoothed = Pmf(densities, grid, **options)
    smoothed.normalize()
    return smoothed
def testStats(self):
    """Check summary statistics on every representation of a fair die.

    The Pmf, the Cdf, the survival function and the hazard function
    (made from either the Pmf or the Cdf) must all report the same mean,
    variance, standard deviation, median and 0.8 quantile.
    """
    def check(dist):
        # Same five assertions for each representation.
        self.assertAlmostEqual(dist.mean(), 3.5)
        self.assertAlmostEqual(dist.var(), 2.91666666)
        self.assertAlmostEqual(dist.std(), 1.70782512)
        self.assertAlmostEqual(dist.median(), 3)
        self.assertAlmostEqual(dist.quantile(0.8), 5)

    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    check(pmf)

    cdf = pmf.make_cdf()
    check(cdf)

    check(pmf.make_surv())

    # Hazard function derived from the Pmf, then from the Cdf.
    check(pmf.make_hazard())
    check(cdf.make_hazard())
def testHead(self):
    """Check that head() returns the same type as the object it is called on."""
    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    pmf_head = pmf.head()
    self.assertEqual(type(pmf_head), type(pmf))

    cdf = pmf.make_cdf()
    cdf_head = cdf.head()
    self.assertEqual(type(cdf_head), type(cdf))
def testCredible(self):
    """Check the 90% credible interval of a uniform distribution on 0..100."""
    values = np.arange(101)
    pmf = Pmf.from_seq(values)
    cdf = pmf.make_cdf()

    # Pmf and Cdf are expected to report the same interval.
    for dist in (pmf, cdf):
        interval = dist.credible_interval(0.9)
        self.assertListEqual(list(interval), [5, 95])
def make_uniform(start, stop, num=51, name=None, **options):
    """Build a uniform Pmf over equally spaced points in [start, stop].

    start: lower bound
    stop: upper bound
    num: number of points
    name: string name for the quantities (applied to the index if truthy)
    options: passed to Pmf

    returns: Pmf
    """
    grid = np.linspace(start, stop, num)
    uniform = Pmf(1.0, grid, **options)
    uniform.normalize()

    if not name:
        return uniform

    uniform.index.name = name
    return uniform
def pmf_marginal(joint_pmf, level):
    """Compute a marginal distribution.

    Probabilities that share the same value at index level `level` are
    added together.

    joint_pmf: Pmf representing a joint distribution
               (assumes a MultiIndex with one level per variable)
    level: int, level to sum along

    returns: Pmf
    """
    # `Series.sum(level=...)` was deprecated in pandas 1.3 and removed in
    # pandas 2.0; grouping by the level is the documented replacement and
    # also works on older pandas versions.
    return Pmf(joint_pmf.groupby(level=level).sum())
def make_die(sides):
    """Build a Pmf for a fair die.

    Outcomes run from 1 through `sides`, each with probability 1/sides.

    sides: int

    returns: Pmf
    """
    faces = np.arange(1, sides + 1)
    return Pmf(1 / sides, faces)
def make_binomial(n, p):
    """Build the binomial distribution of the number of successes.

    n: number of trials
    p: probability of success

    returns: Pmf representing the distribution of k, for k = 0..n
    """
    successes = np.arange(n + 1)
    probabilities = binom.pmf(successes, n, p)
    return Pmf(probabilities, successes)
def marginal(joint, axis):
    """Sum a joint distribution along one axis to get a marginal.

    axis=1 returns the marginal distribution of the first variable
    axis=0 returns the marginal distribution of the second variable

    joint: DataFrame representing a joint distribution
    axis: int axis to sum along

    returns: Pmf
    """
    summed = joint.sum(axis=axis)
    return Pmf(summed)
def pmf_of_age(brfss):
    """Show a bar chart of the PMF of the "AGE" column.

    brfss: object indexable by column name (presumably a DataFrame of
           BRFSS survey data; confirm against the caller)
    """
    # Build the PMF from the age column and draw it as bars.
    ages = brfss["AGE"]
    age_pmf = Pmf.from_seq(ages)
    age_pmf.bar()

    # Axis labels, then display the figure.
    plt.xlabel('Age in years')
    plt.ylabel('PMF')
    plt.show()
def testPmfSampling(self):
    """Check that choice and sample give the expected draws for seed 17."""
    pmf = Pmf.from_seq([1, 2, 3, 4, 5, 6])
    expected = [2, 4, 2, 1, 5, 4, 4, 4, 1, 3]

    # test choice: seeded through NumPy's global random state
    np.random.seed(17)
    drawn = pmf.choice(10)
    self.assertTrue(np.all(drawn == expected))

    # test sample: seeded through the random_state argument
    drawn = pmf.sample(10, replace=True, random_state=17)
    self.assertTrue(np.all(drawn == expected))
def testCopy(self):
    """Check that copies of a Pmf and a Cdf hold the same values."""
    values = [1, 2, 2, 3, 5]

    pmf = Pmf.from_seq(values)
    pmf_clone = pmf.copy()
    for q in pmf.qs:
        self.assertAlmostEqual(pmf[q], pmf_clone[q])

    cdf = pmf.make_cdf()
    cdf_clone = cdf.copy()
    for q in cdf.qs:
        self.assertAlmostEqual(cdf[q], cdf_clone[q])
def testNormalize(self):
    """Check normalize() on an un-normalized Pmf and Cdf.

    normalize() is expected to return the total before normalizing
    (9 for these nine values) and to rescale the distribution in place.
    """
    values = [0, 1, 2, 3, 3, 4, 4, 4, 5]

    pmf = Pmf.from_seq(values, normalize=False)
    pmf_total = pmf.normalize()
    self.assertAlmostEqual(pmf_total, 9)
    self.assertAlmostEqual(pmf[3], 0.22222222)

    cdf = Cdf.from_seq(values, normalize=False)
    cdf_total = cdf.normalize()
    self.assertAlmostEqual(cdf_total, 9)
    self.assertAlmostEqual(cdf(3), 0.55555555)
def make_mixture(pmf, pmf_seq):
    """Combine conditional distributions into a weighted mixture.

    pmf: mapping from each hypothesis to its probability
    pmf_seq: sequence of Pmfs, each representing
             a conditional distribution for one hypothesis

    returns: Pmf representing the mixture
    """
    # One row per Pmf; quantities missing from a Pmf get probability 0.
    table = pd.DataFrame(pmf_seq).fillna(0)

    # Transpose so each column holds one hypothesis, then weight every
    # column by the probability of its hypothesis.
    weighted = table.transpose()
    weighted *= pmf.ps

    # Adding across hypotheses gives the mixture.
    return Pmf(weighted.sum(axis=1))
def make_a_pmf(gss):
    """Show a bar chart of the PMF of the 'age' column.

    gss: object indexable by column name whose 'age' entry exposes
         `.values` (presumably a DataFrame of GSS data; confirm against
         the caller)
    """
    # Take the raw values of the age column and build the PMF.
    ages = gss['age'].values
    age_pmf = Pmf.from_seq(ages)

    # Draw the bars, label the axes and display the figure.
    age_pmf.bar()
    plt.xlabel('Age')
    plt.ylabel('PMF')
    plt.show()
def compare_fb_to_ws(
    dirname='/Users/bensmith/Documents/ThinkSeries/ThinkComplexity2/data/',
):
    """Plots Facebook network data vs. Watts-Strogatz

    Reads the Facebook graph from `facebook_combined.txt.gz`, builds a
    Watts-Strogatz graph from the `n` and `k` values reported for it
    (rewiring probability 0.05, seed 15), and plots the degree PMF of
    each graph on log-log axes in side-by-side panels. The figure is
    saved as 'myfigs/chap04-1' and then displayed.

    dirname: directory containing `facebook_combined.txt.gz`; defaults
             to the location that was previously hard-coded, so existing
             zero-argument calls behave as before
    """
    # Local import keeps this block self-contained.
    import os

    # os.path.join accepts `dirname` with or without a trailing separator.
    fin = os.path.join(dirname, 'facebook_combined.txt.gz')
    fb = read_graph(fin)

    print('Facebook')
    # NOTE(review): presumably n = nodes, m = edges, k = mean degree;
    # confirm against analyze_graph. Only n, k and degs are used here.
    n, _, k, degs = analyze_graph(fb)
    pmf_fb = Pmf.from_seq(degs)

    x = 25
    print('fewer than %i friends: %.3f' % (x, cumulative_prob(pmf_fb, x)))

    ws = nx.watts_strogatz_graph(n, k, 0.05, seed=15)
    print('Watts-Strogatz')
    _, _, _, ws_degs = analyze_graph(ws)
    pmf_ws = Pmf.from_seq(ws_degs)

    plt.figure(figsize=(8, 4))
    # Markers only, no connecting lines.
    options = dict(ls='', marker='.')

    plt.subplot(1, 2, 1)
    # Dashed gray reference line drawn behind the Facebook degree PMF.
    plt.plot([20, 1000], [5e-2, 2e-4], color='gray', linestyle='dashed')
    pmf_fb.plot(label='Facebook', color='C0', **options)
    decorate(xlabel='Degree', ylabel='PMF', xscale='log', yscale='log')

    plt.subplot(1, 2, 2)
    pmf_ws.plot(label='WS graph', color='C1', **options)
    decorate(xlabel='Degree', xscale='log', yscale='log')

    savefig('myfigs/chap04-1')
    plt.show()