Example #1
0
    def test_decodehmm(self):
        """Train the HMM, decode one sequence and compare with the known tags.

        The input sequence is built from the first element of every item in
        ``self.train_tups[0]``; the decoded output is expected to be a list
        of ``(word, tag)`` pairs.
        """
        model = hw5.NamedEntityRecognitionHMM()
        model.train(self.train_tups)

        # Keep only the first field of each item in the first training entry.
        first_entry = self.train_tups[0]
        words = [item[0] for item in first_entry]
        probs, pointers = model.generate_probabilities(words)

        expected = [
            ('Comparison', 'O'),
            ('in', 'O'),
            ('alkaline', 'B'),
            ('phosphatases', 'I'),
            ('in', 'O'),
            ('5', 'B'),
            ('-', 'I'),
            ('nucleotidase', 'I'),
            ('.', 'O'),
        ]

        # Recover the label sequence from the probability/pointer tables.
        decoded = hw5.decode(words, probs, pointers)
        self.assertEqual(expected, decoded)
    def test_hmmprobspointers(self):
        """Validate the tables returned by ``generate_probabilities``.

        Checks, in order:
          * both tables have 9 columns, each a plain ``dict`` with 3 entries;
          * the ``'O'`` probability in every column is a float, and the
            ``'O'`` back pointer is either ``None`` or a string;
          * the first two probability columns match hand-computed values;
          * every back pointer matches the expected table.
        """
        hmm = hw5.NamedEntityRecognitionHMM()
        hmm.train(self.train_tups)
        test = [w[0] for w in self.train_tups[0]]
        probs, pointers = hmm.generate_probabilities(test)

        # correct shape
        self.assertEqual(9, len(probs))
        self.assertEqual(9, len(pointers))
        for row in probs:
            self.assertEqual(3, len(row))
            self.assertIs(type(row), dict)
            # The type check must wrap the value, not the comparison:
            # ``type(x is float)`` is ``bool`` and therefore always truthy.
            self.assertIsInstance(row['O'], float)

        for row in pointers:
            self.assertEqual(3, len(row))
            self.assertIs(type(row), dict)
            # The first column has no predecessor (its expected pointers
            # below are None), so None is accepted alongside str.
            self.assertTrue(row['O'] is None or isinstance(row['O'], str))

        # now we'll test the actual values
        # pi = 0, .5, .5 (I, O, B)
        vocab = 13
        # Emission estimates of the form (count + 1) / (state total + vocab).
        p_Comparison_I = (0 + 1) / (5 + vocab)
        p_Comparison_O = (1 + 1) / (8 + vocab)
        p_Comparison_B = (0 + 1) / (3 + vocab)
        # WARNING THIS CODE DOES NOT FULLY TEST THE PROBABILITIES, ONLY
        # THE FIRST TWO COLUMNS
        column1 = {
            "I": 0 * (p_Comparison_I),
            "O": 0.5 * (p_Comparison_O),
            "B": 0.5 * (p_Comparison_B)
        }
        self.assertEqual(column1, probs[0])

        p_in_I = (0 + 1) / (5 + vocab)
        p_in_O = (3 + 1) / (8 + vocab)
        p_in_B = (0 + 1) / (3 + vocab)

        # Transitions: p_X_Y is the probability of state X given that the
        # previous state was Y (see how they are combined below).
        p_O_O = 4 / 8
        p_O_I = 3 / 5
        p_O_B = 0 / 3

        p_I_O = 0 / 8
        p_I_I = 2 / 5
        p_I_B = 3 / 3

        p_B_O = 2 / 8
        p_B_I = 0 / 5
        p_B_B = 0 / 3
        # the "missing" 2 for the O states are because the sentences end with O,
        # so these are accounted for in the distribution of pi

        # Candidate scores for the second column:
        # emission * transition * previous-column score.
        prev_I_and_I = p_in_I * p_I_I * column1["I"]
        prev_O_and_I = p_in_I * p_I_O * column1["O"]
        prev_B_and_I = p_in_I * p_I_B * column1["B"]

        prev_I_and_O = p_in_O * p_O_I * column1["I"]
        prev_O_and_O = p_in_O * p_O_O * column1["O"]
        prev_B_and_O = p_in_O * p_O_B * column1["B"]

        prev_I_and_B = p_in_B * p_B_I * column1["I"]
        prev_O_and_B = p_in_B * p_B_O * column1["O"]
        prev_B_and_B = p_in_B * p_B_B * column1["B"]

        # Each cell keeps the best-scoring predecessor.
        I_val = max(prev_I_and_I, prev_O_and_I, prev_B_and_I)
        O_val = max(prev_I_and_O, prev_O_and_O, prev_B_and_O)
        B_val = max(prev_I_and_B, prev_O_and_B, prev_B_and_B)

        self.assertAlmostEqual(I_val, probs[1]["I"])
        self.assertAlmostEqual(O_val, probs[1]["O"])
        self.assertAlmostEqual(B_val, probs[1]["B"])

        # ensure that back pointers are correct
        point_answers = [
            {'O': None, 'I': None, 'B': None},
            {'O': 'O', 'I': 'B', 'B': 'O'},
            {'O': 'O', 'I': 'B', 'B': 'O'},
            {'O': 'O', 'I': 'B', 'B': 'O'},
            {'O': 'I', 'I': 'I', 'B': 'O'},
            {'O': 'O', 'I': 'I', 'B': 'O'},
            {'O': 'O', 'I': 'B', 'B': 'O'},
            {'O': 'I', 'I': 'I', 'B': 'O'},
            {'O': 'I', 'I': 'I', 'B': 'O'},
        ]
        self.assertEqual(point_answers, pointers)