Example #1
0
	def __init__(self, sampleData, fileCorrecting):
		"""Train an HMM on sampleData and load the text that should be corrected.

		sampleData is handed to HMM.buildFromData; fileCorrecting is handed to
		self.parseFileCorrecting, whose two return values become the
		reference states and the observations.
		"""
		model = HMM()
		self.hmm = model
		model.buildFromData(sampleData)

		# persistHMM yields the three tables in this order.
		(self.marginal,
		 self.emission,
		 self.transition) = model.persistHMM()

		(self.actualStates,
		 self.observations) = self.parseFileCorrecting(fileCorrecting)

		# Until a correction is computed, the "before and after" sequence is
		# simply the observation list itself (same list object, not a copy).
		self.beforeAndAfterStates = self.observations
		self.correctedText = None
Example #2
0
class viterbi(object):
	"""Correct a noisy character sequence using the Viterbi algorithm.

	The hidden states are the lowercase letters 'a'-'z' plus '_' (which is
	rendered as a space in the corrected text). The probability tables are
	looked up as:

	  marginal[state]
	  emission[(observation, state)]
	  transition[(curState, prevState)]
	"""

	def __init__(self, sampleData, fileCorrecting):
		"""Build the HMM from sampleData and parse the file to correct.

		sampleData     -- passed unchanged to HMM.buildFromData.
		fileCorrecting -- path of the file read by parseFileCorrecting.
		"""
		self.hmm = HMM()
		self.hmm.buildFromData(sampleData)
		self.marginal, self.emission, self.transition = self.hmm.persistHMM()
		self.actualStates, self.observations = self.parseFileCorrecting(fileCorrecting)
		# Before any correction runs, the sequence under evaluation is the
		# raw observation list (same object, not a copy).
		self.beforeAndAfterStates = self.observations
		self.correctedText = None

	def parseFileCorrecting(self, filename):
		"""Read "<actual> <observed>" pairs from filename.

		Only lines containing a space are used. The text before the first
		space is the actual state; the first character after it is the
		observation. Lines whose second field is empty are skipped so the
		two lists always stay aligned.

		Returns a tuple (actualStates, observations) of two lists.
		"""
		actualStates = []
		observations = []

		# The with-block guarantees the file is closed even if reading fails.
		with open(filename, "r") as f:
			for line in f:
				if " " not in line:
					continue
				fields = line.split(" ")
				if not fields[1]:
					# e.g. a trailing space at end of file or a double space:
					# there is no observed character to record.
					continue
				actualStates.append(fields[0])
				observations.append(fields[1][0])

		return actualStates, observations

	@staticmethod
	def _safeLog(probability):
		"""Return log(probability), mapping exactly 0 to -infinity.

		math.log(0) raises ValueError; -inf keeps an impossible path
		comparable (and never preferred) instead of aborting the decode.
		Negative inputs still raise ValueError from math.log.
		"""
		if probability == 0:
			return float("-inf")
		return math.log(probability)

	def determineProbableSequence(self):
		"""Compute the most probable state sequence for self.observations.

		Stores the winning path as a list of characters in
		self.beforeAndAfterStates and, with '_' replaced by ' ', as a string
		in self.correctedText. With no observations the results are an empty
		list and an empty string.
		"""
		self.beforeAndAfterStates = []

		if not self.observations:
			self.correctedText = ""
			return

		states = [chr(code) for code in range(ord("a"), ord("z") + 1)]
		states.append("_")

		log = self._safeLog

		# Initial step: V(0, x) = P(E(0) | x) * P(x), kept in log space since
		# the products can become very small.
		firstObservation = self.observations[0]
		previous = {}
		path = {}
		for state in states:
			previous[state] = (
				log(self.emission[(firstObservation, state)])
				+ log(self.marginal[state]))
			path[state] = state

		# Recursion: V(t, x) = P(E(t) | x) * P(x | x') * V(t-1, x'),
		# maximised over the previous state x'.
		for t in range(1, len(self.observations)):
			observed = self.observations[t]
			current = {}
			updatedPath = {}

			for curState in states:
				# The emission term does not depend on prevState; take its
				# log once instead of once per predecessor.
				emissionLog = log(self.emission[(observed, curState)])

				bestPrev = None
				bestScore = None
				for prevState in states:
					# NOTE(review): transition is keyed (curState, prevState);
					# confirm this matches the HMM's key convention.
					score = (
						emissionLog
						+ log(self.transition[(curState, prevState)])
						+ previous[prevState])
					# Strict '>' keeps the first predecessor on ties.
					if bestScore is None or score > bestScore:
						bestPrev = prevState
						bestScore = score

				current[curState] = bestScore
				updatedPath[curState] = path[bestPrev] + curState

			previous = current
			path = updatedPath

		finalState = max(states, key=previous.get)
		bestPath = path[finalState]

		self.beforeAndAfterStates = list(bestPath)
		self.correctedText = bestPath.replace("_", " ")

	def calculateErrorRate(self):
		"""Return the fraction of positions that differ from actualStates.

		Every position of self.beforeAndAfterStates is compared, including
		the last one. Returns 0.0 when there is nothing to compare.
		"""
		if not self.beforeAndAfterStates:
			return 0.0

		mismatches = sum(
			1
			for predicted, actual in zip(self.beforeAndAfterStates, self.actualStates)
			if predicted != actual)
		return mismatches / float(len(self.beforeAndAfterStates))

	def formatAndPrint(self, initialErrors, finalErrors):
		"""Print each state on its own line, then both error rates as percentages.

		initialErrors and finalErrors are fractions (e.g. 0.25 for 25 %).
		"""
		# Each print call takes a single pre-formatted string so the output
		# is the same under Python 2 and Python 3.
		for char in self.beforeAndAfterStates:
			print(char)
		print("\n\n")
		print("Initial Error Rate: %s %%" % (initialErrors * 100,))
		print("Final Error Rate: %s %%" % (finalErrors * 100,))