def test_decodehmm(self):
    """Decode the first training sentence and compare with the expected tags.

    Trains a fresh ``hw5.NamedEntityRecognitionHMM`` on ``self.train_tups``,
    asks it for the probability and back-pointer tables of the first
    training sentence, and checks that ``hw5.decode`` turns those tables
    into the expected list of ``(word, label)`` pairs.
    """
    expected_tagging = [
        ('Comparison', 'O'),
        ('in', 'O'),
        ('alkaline', 'B'),
        ('phosphatases', 'I'),
        ('in', 'O'),
        ('5', 'B'),
        ('-', 'I'),
        ('nucleotidase', 'I'),
        ('.', 'O'),
    ]

    model = hw5.NamedEntityRecognitionHMM()
    model.train(self.train_tups)

    # Only the first element of every entry is fed to the model.
    first_sentence = self.train_tups[0]
    words = [entry[0] for entry in first_sentence]

    # Build both tables, then trace them back into a labelled sequence.
    prob_table, back_pointers = model.generate_probabilities(words)
    decoded = hw5.decode(words, prob_table, back_pointers)

    self.assertEqual(expected_tagging, decoded)
def test_hmmprobspointers(self):
    """Check the tables returned by ``generate_probabilities``.

    Trains ``hw5.NamedEntityRecognitionHMM`` on ``self.train_tups`` and
    runs ``generate_probabilities`` on the first element of every entry
    of ``self.train_tups[0]``.  The test then verifies:

    * both tables have 9 rows, each a plain ``dict`` with 3 entries;
    * the ``'O'`` probability in every row is a ``float`` and the ``'O'``
      back pointer in every row is a ``str`` (or ``None``, as in the
      first column, which has no predecessor);
    * the probability values of the first two columns;
    * every back pointer.

    WARNING: only the first two probability columns are checked by value.
    """
    hmm = hw5.NamedEntityRecognitionHMM()
    hmm.train(self.train_tups)
    test = [w[0] for w in self.train_tups[0]]
    probs, pointers = hmm.generate_probabilities(test)

    # correct shape
    self.assertEqual(9, len(probs))
    self.assertEqual(9, len(pointers))
    for row in probs:
        self.assertEqual(3, len(row))
        self.assertIs(type(row), dict)
        # The original wrote type(row['O'] is float), which is always
        # truthy; check the value's type for real.
        self.assertIsInstance(row['O'], float)
    for row in pointers:
        self.assertEqual(3, len(row))
        self.assertIs(type(row), dict)
        # The first column's pointers are None (see point_answers below),
        # so None must be accepted alongside str.
        self.assertTrue(row['O'] is None or isinstance(row['O'], str))

    # now we'll test the actual values
    # pi = 0, .5, .5 (I, O, B)
    vocab = 13
    # Emission terms have the form (count + 1) / (state_total + vocab).
    p_Comparison_I = (0 + 1) / (5 + vocab)
    p_Comparison_O = (1 + 1) / (8 + vocab)
    p_Comparison_B = (0 + 1) / (3 + vocab)

    # WARNING THIS CODE DOES NOT FULLY TEST THE PROBABILITIES, ONLY
    # THE FIRST TWO COLUMNS
    column1 = {
        "I": 0 * (p_Comparison_I),
        "O": 0.5 * (p_Comparison_O),
        "B": 0.5 * (p_Comparison_B),
    }
    self.assertEqual(column1, probs[0])

    p_in_I = (0 + 1) / (5 + vocab)
    p_in_O = (3 + 1) / (8 + vocab)
    p_in_B = (0 + 1) / (3 + vocab)

    # p_X_Y is the transition term for current state X given previous
    # state Y; it is multiplied with column1[Y] below.
    p_O_O = 4 / 8
    p_O_I = 3 / 5
    p_O_B = 0 / 3
    p_I_O = 0 / 8
    p_I_I = 2 / 5
    p_I_B = 3 / 3
    p_B_O = 2 / 8
    p_B_I = 0 / 5
    p_B_B = 0 / 3
    # the "missing" 2 for the O states are because the sentences end with O,
    # so these are accounted for in the distribution of pi

    # prev_Y_and_X: previous state Y followed by current state X.
    prev_I_and_I = p_in_I * p_I_I * column1["I"]
    prev_O_and_I = p_in_I * p_I_O * column1["O"]
    prev_B_and_I = p_in_I * p_I_B * column1["B"]

    prev_I_and_O = p_in_O * p_O_I * column1["I"]
    prev_O_and_O = p_in_O * p_O_O * column1["O"]
    prev_B_and_O = p_in_O * p_O_B * column1["B"]

    prev_I_and_B = p_in_B * p_B_I * column1["I"]
    prev_O_and_B = p_in_B * p_B_O * column1["O"]
    prev_B_and_B = p_in_B * p_B_B * column1["B"]

    # Each cell of the second column keeps the best-scoring predecessor.
    I_val = max(prev_I_and_I, prev_O_and_I, prev_B_and_I)
    O_val = max(prev_I_and_O, prev_O_and_O, prev_B_and_O)
    B_val = max(prev_I_and_B, prev_O_and_B, prev_B_and_B)

    self.assertAlmostEqual(I_val, probs[1]["I"])
    self.assertAlmostEqual(O_val, probs[1]["O"])
    self.assertAlmostEqual(B_val, probs[1]["B"])

    # ensure that back pointers are correct
    point_answers = [
        {'O': None, 'I': None, 'B': None},
        {'O': 'O', 'I': 'B', 'B': 'O'},
        {'O': 'O', 'I': 'B', 'B': 'O'},
        {'O': 'O', 'I': 'B', 'B': 'O'},
        {'O': 'I', 'I': 'I', 'B': 'O'},
        {'O': 'O', 'I': 'I', 'B': 'O'},
        {'O': 'O', 'I': 'B', 'B': 'O'},
        {'O': 'I', 'I': 'I', 'B': 'O'},
        {'O': 'I', 'I': 'I', 'B': 'O'},
    ]
    self.assertEqual(point_answers, pointers)