def __init__(self, *al, **kw):
    super(CookieProblem, self).__init__(*al, **kw)
    self.hypotheses = dict(
        bowl_1=PMF(vanilla=30, chocolate=10),
        bowl_2=PMF(vanilla=20, chocolate=20),
    )
    self.uniform_dist(self.hypotheses)
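# The constructor above only sets up the priors; for PMF.update to do anything,
# the class presumably also supplies a likelihood hook, as the LocomotiveProblem
# subclass in the German tank test further down does. The following is a
# minimal, self-contained sketch of that whole pattern using plain dicts and an
# invented name (CookieProblemSketch) -- an assumption about how the pieces fit
# together, not the library's actual code.
class CookieProblemSketch(dict):
    def __init__(self):
        self.hypotheses = dict(
            bowl_1=dict(vanilla=30, chocolate=10),
            bowl_2=dict(vanilla=20, chocolate=20),
        )
        for bowl in self.hypotheses:
            self[bowl] = 1.0 / len(self.hypotheses)

    def likelihood(self, data, given):
        # P(flavor | bowl): that flavor's count over the bowl's total count.
        bowl = self.hypotheses[given]
        return float(bowl.get(data, 0)) / sum(bowl.values())

    def update(self, data):
        # Bayes' rule: scale each prior by its likelihood, then renormalize.
        for hypothesis in self:
            self[hypothesis] *= self.likelihood(data, hypothesis)
        total = sum(self.values())
        for hypothesis in self:
            self[hypothesis] /= total

# Updating on a vanilla cookie gives the familiar posterior:
#   p = CookieProblemSketch(); p.update('vanilla'); p['bowl_1']  ->  0.6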
def test_add_two_independent_pmfs(self):
    left_pmf, right_pmf = PMF(), PMF()
    left_pmf.uniform_dist((0, 1))
    right_pmf.uniform_dist((0, 1))
    sum_pmf = add_two_independent_pmfs(left_pmf, right_pmf)
    self.assertTrue(0.249 < sum_pmf[0] < 0.251)
    self.assertTrue(0.499 < sum_pmf[1] < 0.501)
    self.assertTrue(0.249 < sum_pmf[2] < 0.251)
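# A minimal sketch (an assumption, not the library's implementation) of the
# convolution the test above checks: for independent X and Y,
# P(X + Y = s) is the sum of P(X = x) * P(Y = y) over all pairs with x + y = s.
def add_two_independent_pmfs_sketch(left, right):
    result = {}
    for x, p_x in left.items():
        for y, p_y in right.items():
            result[x + y] = result.get(x + y, 0.0) + p_x * p_y
    return result

# Two fair coins, {0: 0.5, 1: 0.5} each, convolve to {0: 0.25, 1: 0.5, 2: 0.25},
# which is exactly the distribution the assertions above bracket.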
def test_filter_possible_events(self):
    pmf = PMF()
    pmf.uniform_dist("abcdef")
    pmf["f"] = 0
    filtered_pmf = filter_possible_events(pmf)
    for x in "abcde":
        self.assertTrue(x in filtered_pmf)
    # The zero-probability event should have been filtered out.
    self.assertFalse("f" in filtered_pmf)
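# A one-line sketch of the filtering this test pins down (assumed semantics:
# drop outcomes whose probability has fallen to zero, keep everything else).
def filter_possible_events_sketch(pmf):
    return dict((x, p) for x, p in pmf.items() if p > 0)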
def test_iadd_pmfs(self):
    left_pmf, right_pmf = PMF(), PMF()
    left_pmf.uniform_dist((0, 1))
    right_pmf.uniform_dist((0, 1))
    left_pmf += right_pmf
    self.assertTrue(0.249 < left_pmf[0] < 0.251)
    self.assertTrue(0.499 < left_pmf[1] < 0.501)
    self.assertTrue(0.249 < left_pmf[2] < 0.251)
def test_random_from_uniform_dist(self):
    # Test is probabilistic, which is not great, but is necessary.
    simulation_pmf = PMF()
    for n in range(10000):
        x = self.pmf.random()
        hit_count = simulation_pmf.get(x, 0)
        simulation_pmf[x] = hit_count + 1
    simulation_pmf.normalize()
    for x in "abcde":
        self.assertTrue(0.190 < simulation_pmf[x] < 0.210)
def test_random_from_power_dist(self):
    # Test is probabilistic, which is not great, but is necessary.
    self.pmf.power_law_dist(xrange(1, 4))
    simulation_pmf = PMF()
    for n in range(10000):
        x = self.pmf.random()
        hit_count = simulation_pmf.get(x, 0)
        simulation_pmf[x] = hit_count + 1
    simulation_pmf.normalize()
    self.assertTrue(0.540 < simulation_pmf[1] < 0.550)
    self.assertTrue(0.262 < simulation_pmf[2] < 0.283)
    self.assertTrue(0.166 < simulation_pmf[3] < 0.197)
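# Hand check of the bounds above, assuming power_law_dist(xrange(1, 4)) weights
# each x in proportion to 1/x before normalizing (an assumption consistent with
# the asserted windows): the weights 1, 1/2, 1/3 sum to 11/6, so
#   P(1) = 6/11 ~ 0.545,  P(2) = 3/11 ~ 0.273,  P(3) = 2/11 ~ 0.182,
# which fall inside (0.540, 0.550), (0.262, 0.283), and (0.166, 0.197).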
def test_sum_three_pmfs(self):
    pmfs = [PMF.fromkeys((0, 1), 0.5) for n in range(3)]
    sum_pmf = sum_independent_pmfs(pmfs)
    self.assertTrue(0.124 < sum_pmf[0] < 0.126)
    self.assertTrue(0.374 < sum_pmf[1] < 0.376)
    self.assertTrue(0.374 < sum_pmf[2] < 0.376)
    self.assertTrue(0.124 < sum_pmf[3] < 0.126)
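# Hand check of the three-PMF case above: the sum of three independent fair 0/1
# variables is binomial(3, 1/2), so
#   P(0) = 1/8 = 0.125,  P(1) = 3/8 = 0.375,  P(2) = 3/8 = 0.375,  P(3) = 1/8 = 0.125,
# matching the four windows asserted.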
def test_cookie_problem_with_arbitrary_factors(self):
    """
    test_cookie_problem_with_arbitrary_factors (irrealis_bayes.tests.FunctionalTestPMF)

    We can multiply the likelihoods by any convenient factor, as long as every
    entry in the dictionary is scaled by the same factor; normalizing
    afterwards recovers the probability distribution. Here we use raw vanilla
    counts (30 and 20) in place of the likelihoods 0.75 and 0.5, scaling both
    by 40.
    """
    # One "bowl_1" and one "bowl_2".
    pmf = PMF(bowl_1=1, bowl_2=1)
    # Thirty vanilla cookies (out of forty) in bowl_1.
    pmf["bowl_1"] *= 30
    # Twenty vanilla cookies (out of forty) in bowl_2.
    pmf["bowl_2"] *= 20
    # This normalizes the dictionary into a probability distribution.
    pmf.normalize()
    self.assertTrue(0.599 < pmf["bowl_1"] < 0.601)
def __init__(self, *al, **kw):
    super(CookieProblem, self).__init__(*al, **kw)
    # These encode the initial state of the bowls.
    bowl_1 = PMF(vanilla=30, chocolate=10)
    bowl_2 = PMF(vanilla=20, chocolate=20)
    self.hypotheses = dict(
        # The states of the different hypotheses mustn't depend on each
        # other, so each hypothesis gets its own copy of the initial state.
        A=dict(bowl_a=bowl_1.copy(), bowl_b=bowl_2.copy()),
        B=dict(bowl_a=bowl_2.copy(), bowl_b=bowl_1.copy()),
    )
    self.uniform_dist(self.hypotheses)
def __init__(self, *al, **kw):
    super(MnMProblem, self).__init__(*al, **kw)
    mix94 = PMF(brown=30, yellow=20, red=20, green=10, orange=10, tan=10)
    mix96 = PMF(blue=24, green=20, orange=16, yellow=14, red=13, brown=13)
    self.hypotheses = dict(
        A=dict(bag_1=mix94, bag_2=mix96),
        B=dict(bag_1=mix96, bag_2=mix94),
    )
    self.uniform_dist(self.hypotheses)
class TestCDF(unittest.TestCase):
    def setUp(self):
        self.pmf = PMF()
        self.pmf.uniform_dist('abcde')
        self.cdf = CDF(self.pmf)

    def test_percentile(self):
        self.assertEqual('a', self.cdf.percentile(0.0))
        self.assertEqual('a', self.cdf.percentile(0.1))
        self.assertEqual('a', self.cdf.percentile(0.2))
        self.assertEqual('b', self.cdf.percentile(0.3))
        self.assertEqual('b', self.cdf.percentile(0.4))
        self.assertEqual('c', self.cdf.percentile(0.5))
        self.assertEqual('c', self.cdf.percentile(0.6))
        self.assertEqual('d', self.cdf.percentile(0.7))
        self.assertEqual('d', self.cdf.percentile(0.8))
        self.assertEqual('e', self.cdf.percentile(0.9))
        self.assertEqual('e', self.cdf.percentile(1.0))

    def test_percentiles(self):
        self.assertEqual(('b', 'd'), self.cdf.percentiles(0.3, 0.8))
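# A minimal sketch (assumed, not the library's CDF code) of the percentile
# lookup the tests above pin down: walk the outcomes in sorted order,
# accumulate probability, and return the first outcome whose cumulative
# probability reaches the requested level.
def percentile_sketch(pmf, p):
    cumulative = 0.0
    outcome = None
    for outcome in sorted(pmf):
        cumulative += pmf[outcome]
        if p <= cumulative:
            return outcome
    return outcome

# For the uniform PMF on 'abcde' (0.2 each), percentile_sketch(pmf, 0.3)
# returns 'b' and percentile_sketch(pmf, 0.8) returns 'd', in line with
# test_percentile and test_percentiles above.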
def test_basic_cookie_problem(self):
    """
    test_basic_cookie_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes: Suppose there are two bowls of cookies. The first bowl
    contains 30 vanilla cookies and 10 chocolate cookies. The second bowl
    contains 20 of each. Now suppose you choose one of the bowls at random
    and, without looking, select a cookie from the bowl at random. The cookie
    is vanilla. What is the probability that it came from the first bowl?

    Prior to choosing the cookie, the probability P(bowl_1) of choosing the
    first bowl was 0.5 (since we were equally likely to choose either bowl).
    Assuming we had chosen the first bowl, the likelihood P(vanilla | bowl_1)
    of choosing a vanilla cookie was 0.75 (30 vanilla cookies out of a total
    of 40 cookies in the first bowl). On the other hand, assuming we had
    chosen the second bowl, the likelihood P(vanilla | bowl_2) of choosing a
    vanilla cookie was 0.5 (20 vanilla cookies out of 40 cookies in the
    second bowl). Since our hypotheses (bowl one or bowl two) are exclusive
    and exhaustive, the law of total probability gives:

        P(bowl_1 | vanilla)
          = (P(bowl_1)*P(vanilla | bowl_1))
            / (P(bowl_1)*P(vanilla | bowl_1) + P(bowl_2)*P(vanilla | bowl_2))
          = (0.5*0.75)/(0.5*0.75 + 0.5*0.5)
          = (0.75)/(0.75 + 0.5)
          = 0.6
    """
    pmf = PMF(bowl_1=0.5, bowl_2=0.5)
    pmf["bowl_1"] *= 0.75
    pmf["bowl_2"] *= 0.5
    pmf.normalize()
    self.assertTrue(0.599 < pmf["bowl_1"] < 0.601)
def test_sum_two_pmfs(self):
    pmfs = [PMF.fromkeys((0, 1), 0.5) for n in range(2)]
    sum_pmf = sum_independent_pmfs(pmfs)
    self.assertTrue(0.249 < sum_pmf[0] < 0.251)
    self.assertTrue(0.499 < sum_pmf[1] < 0.501)
    self.assertTrue(0.249 < sum_pmf[2] < 0.251)
def test_expectation(self):
    pmf = PMF.fromkeys((1, 2, 3), 1.0)
    pmf.normalize()
    self.assertTrue(1.999 < pmf.expectation() < 2.001)
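# A one-line sketch of the expectation being asserted (assumed semantics:
# E[X] = sum of x * P(x)); for the normalized PMF on {1, 2, 3} this is
# (1 + 2 + 3) / 3 = 2, hence the window around 2.0.
def expectation_sketch(pmf):
    return sum(x * p for x, p in pmf.items())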
def test_unimplemented_likelihood_raises(self):
    pmf = PMF(x=2)
    with self.assertRaises(NotImplementedError):
        pmf.update("blah")
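# The test above pins down the template-method contract this library appears to
# use: PMF.update presumably calls self.likelihood(data, given) for each
# hypothesis, and the base class's likelihood raises NotImplementedError so that
# problem-specific subclasses (like LocomotiveProblem below) must override it.
# A minimal sketch of that base-class hook, as an assumption:
#
#     def likelihood(self, data, given):
#         raise NotImplementedError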
def test_german_tank_problem(self):
    '''
    test_german_tank_problem (irrealis_bayes.tests.FunctionalTestPMF)

    From Think Bayes: During World War II, the Economic Warfare Division of
    the American Embassy in London used statistical analysis to estimate
    German production of tanks and other equipment. The Western Allies had
    captured log books, inventories, and repair records that included chassis
    and engine serial numbers for individual tanks. Analysis of these records
    indicated that serial numbers were allocated by manufacturer and tank
    type in blocks of 100 numbers, that numbers in each block were used
    sequentially, and that not all numbers in each block were used. So the
    problem of estimating German tank production could be reduced, within
    each block of 100 numbers, to a form of the locomotive problem. Based on
    this insight, American and British analysts produced estimates
    substantially lower than estimates from other forms of intelligence. And
    after the war, records indicated that they were substantially more
    accurate. They performed similar analyses for tires, trucks, rockets, and
    other equipment, yielding accurate and actionable economic intelligence.

    The German tank problem is historically interesting; it is also a nice
    example of a real-world application of statistical estimation. Let's try
    a simplified version of this problem. Let's assume four producers A, B,
    C, and D produced 10, 30, 100, and 300 tanks, respectively, in a given
    time period, and that serial number blocks were allocated and used as
    follows:

        Producer    Allocated   Used        Subtotal
        A           0-99        0-9         10
        B           100-199     100-129     30
        C           200-299     200-242     43
        C           300-399     300-356     57
        D           400-499     400-465     66
        D           500-599     500-583     84
        D           600-699     600-670     71
        D           700-799     700-778     79

    Now let's pretend we don't know how many tanks were made, nor which
    serial numbers were used, and then try to infer the total number of tanks
    on the basis of the serial numbers observed.
    '''
    # First we'll create a distribution for sampling. This distribution will
    # be uniform over the serial numbers actually used.
    serial_number_blocks = (
        (0, 9),
        (100, 129),
        (200, 242),
        (300, 356),
        (400, 465),
        (500, 583),
        (600, 670),
        (700, 778),
    )
    # Make a list of all actual serial numbers.
    serial_numbers = sum(
        (range(start, end + 1) for (start, end) in serial_number_blocks), [])
    sampling_dist = PMF()
    sampling_dist.uniform_dist(serial_numbers)

    # Pretending we don't know much, we'll assume a set of ten blocks of 100
    # serial numbers per block, treating each block as in the locomotive
    # problem. We'll use a power-law prior for each block. (A modified prior
    # that also includes the hypothesis that zero serial numbers were used in
    # a given block is sketched, commented out, below.)
    class LocomotiveProblem(PMF):
        def likelihood(self, data, given):
            return 1. / given if 0 <= data < given else 0

    pmfs = [LocomotiveProblem() for n in range(10)]
    for pmf in pmfs:
        pmf.power_law_dist(range(1, 100))
        # The following heavily biases prior distributions toward zero. Have
        # to renormalize after this hack.
        #pmf[0] = 100.; pmf.normalize()

    # Now let's make a bunch of observations, and update our pmfs
    # accordingly.
    random.seed(0)
    for n in range(20):
        observation = sampling_dist.random()
        pmf_number, pmf_partial_serial_number = divmod(observation, 100)
        pmf = pmfs[pmf_number]
        pmf.update(pmf_partial_serial_number)
    print

    # First thing we can try is summing expectations.
    print "sum of expectations:", sum(pmf.expectation() for pmf in pmfs)

    # Second thing we can try is summing endpoints of credible intervals. I
    # think that if I want a final 90% credible interval, I need my
    # individual credible intervals to have probability 0.9**(1./10.).
    cdfs = [CDF(pmf) for pmf in pmfs]
    credible_intervals = [cdf.percentiles(0.005, 0.995) for cdf in cdfs]
    endpoint_arrays = zip(*credible_intervals)
    summed_credible_interval = [sum(array) for array in endpoint_arrays]
    print "90% summed_credible_interval:", summed_credible_interval

    # Third thing we can try is the distribution of sums.
    sum_pmf = sum_independent_pmfs(pmfs)
    print "expectation of sum:", sum_pmf.expectation()
    sum_cdf = CDF(sum_pmf)
    credible_interval_of_sum = sum_cdf.percentiles(0.05, 0.95)
    print "90% credible interval of sum:", credible_interval_of_sum
    credible_interval_of_sum = sum_cdf.percentiles(0.025, 0.975)
    print "95% credible interval of sum:", credible_interval_of_sum
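# An informal check of the 0.9**(1./10.) reasoning in the test above (not part
# of the test itself): if each of the ten per-block credible intervals covers
# its true count with probability p, and the blocks are treated as independent,
# then all ten cover simultaneously with probability p**10. Requiring
# p**10 = 0.9 gives
#   p = 0.9 ** (1. / 10.)        # ~0.9895
#   tail = (1 - p) / 2           # ~0.0052 in each tail
# which is roughly why the per-block intervals use percentiles(0.005, 0.995).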
def test_floats(self):
    self.pmf = PMF.fromkeys("abcde", 1.0)
    self.exercise_pmf()
def setUp(self):
    # Stabilize the random number generator, so test results using random are
    # consistent. (This has side effects, since the state of the Python RNG
    # is global.)
    random.seed(0)
    self.pmf = PMF.fromkeys('abcde', 1)
def test_zerosum(self):
    self.pmf = PMF.fromkeys("abcde", 0)
    self.pmf.normalize()
    total = sum(self.pmf.itervalues())
    # This is how we verify total is 'nan': only 'nan' is not equal to itself.
    self.assertNotEqual(total, total)