class IceCreamHMM(TestCase):
    """Tests against the worked Eisner ice cream example (J & M, Ch. 6)."""

    def setUp(self):
        """Initialize Eisner ice cream HMM (J & M, Figure 6.3)"""
        self.hmm = HMM()
        # J & M call these parameters π (initial), A (transition),
        # B (emission), Q (states), and V (vocabulary). Any aliases are
        # fine, but the trainer must accept them somehow.
        self.hmm.train(
            [],
            initial_probabilities=[.8, .2],        # P(Hot, Cold)
            transition_probabilities=[
                [.7, .3],                          # P(Hot, Cold|Hot)
                [.4, .6],                          # P(Hot, Cold|Cold)
            ],
            emission_probabilities=[
                [.2, .4, .4],                      # P(1, 2, 3|Hot)
                [.5, .4, .1],                      # P(1, 2, 3|Cold)
            ],
            states=("Hot", "Cold"),
            vocabulary=(1, 2, 3))

    def test_likelihood(self):
        """Test likelihood for Eisner ice cream HMM (J & M, Figure 6.7)"""
        # Figure 6.7 of J & M (slide 15 of Lecture6_Handout.pdf,
        # 2014-10-15) has a known erratum in the computation of
        # alpha_2(2): .7*.2 = .14, not .014.
        expected = (.32 * .14 + .02 * .08) + (.32 * .15 + .02 * .30)
        self.assertAlmostEqual(
            self.hmm.likelihood(IceCreamCones([3, 1])), expected)

    def test_decoding(self):
        """Test decoding of Eisner ice cream HMM (J & M, Section 6.4)"""
        # The same erratum occurs in Figure 6.10, but the value given for
        # the Viterbi variable v_2(2) is .0448, which is correct (as you
        # should verify manually and perhaps add a test for here).
        self.assertAlmostEqual(
            self.hmm.classify(IceCreamCones([3, 1, 3]), Test=True, T=1)[0],
            max(.32 * .14, .02 * .08))
        self.assertAlmostEqual(
            self.hmm.classify(IceCreamCones([3, 1, 3]), Test=True, T=1)[1],
            max(.32 * .15, .02 * .30))
        # Full Viterbi decoding of the three-day observation sequence.
        self.assertEqual(self.hmm.classify(IceCreamCones([3, 1, 3])),
                         ["Hot", "Hot", "Hot"])
class TagHMM(TestCase):
    """Train and test an HMM POS tagger."""

    def setUp(self):
        # Hold out the tail of the corpus for testing; train on the rest.
        self.train, self.test = self.split_sents()
        self.hmm = HMM()
        self.hmm.train(self.train)

    def split_sents(self, train=0.95, total=3500,
                    document_class=TaggedSentence):
        """Split the first `total` tagged sentences into train/test parts.

        `train` is the fraction used for training; each sentence is
        wrapped in `document_class`. Pass total=None to use the whole
        corpus. Returns a (train_sents, test_sents) pair.
        """
        sents = tagged_corpus.tagged_sents()[:total]
        total = len(sents) if total is None else total
        i = int(round(train * total))
        j = i + int(round(total - train * total))
        return (map(document_class, sents[0:i]),
                map(document_class, sents[i:j]))

    def accuracy(self, test_sents, verbose=sys.stderr):
        """Compute accuracy of the HMM tagger on the given sentences.

        Returns the fraction of tags guessed correctly as a float in
        [0, 1] (0.0 for empty input).
        """
        total = correct = 0
        for sent in test_sents:
            tags = self.hmm.classify(sent)
            total += len(tags)
            for guess, tag in zip(tags, sent.label):
                correct += (guess == tag)
            if verbose:
                # Running accuracy as an integer percentage.
                print >> verbose, "%.2d%% " % (100 * correct / total),
        # BUG FIX: under Python 2, `correct / total` is integer division,
        # which truncates every non-perfect accuracy to 0 and made the
        # assertGreater(..., 0.85) checks below unpassable. Divide as
        # floats, and guard against an empty test set.
        return correct / float(total) if total else 0.0

    @skip("too slow")
    def test_tag_train(self):
        """Tag the training data"""
        self.assertGreater(self.accuracy(self.train), 0.85)

    def test_tag(self):
        """Tag the test data"""
        self.assertGreater(self.accuracy(self.test), 0.85)
if __name__ == "__main__": hmm = HMM() hmm.train( [], initial_probabilities=[.5, .5], # P(non-coding, Coding) transition_probabilities=[ [.95, .05], # P(Hot, Cold|Hot) [.15, .85] ], # P(Hot, Cold|Cold) emission_probabilities=[ [.4, .1, .1, .4], # P(1, 2, 3|Hot) [.2, .3, .3, .2] ], # P(1, 2, 3|Cold) states=("1", "2"), #noncoding,coding vocabulary=('A', 'C', 'G', 'T')) print hmm.likelihood(Gene(['T', 'G', 'C', 'A'])) print hmm.classify( Gene([ 'G', 'C', 'G', 'C', 'A', 'T', 'T', 'A', 'A', 'T', 'C', 'G', 'T', 'C', 'G', 'T', 'C', 'G', 'T', 'A', 'G', 'T', 'T', 'C', 'C', 'T', 'T' ])) print hmm.classify( Gene([ 'G', 'C', 'G', 'C', 'A', 'T', 'T', 'A', 'A', 'T', 'C', 'G', 'T', 'C', 'G', 'G', 'T', 'C', 'G', 'T', 'A', 'G', 'T', 'T', 'C', 'C', 'T', 'T' ]))