class SpellCorrection:
    """Spelling corrector that combines an edit model with a language model.

    Attributes:
        lm: the language model handed to the constructor; it must provide
            ``score(sentence)``.
        editModel: an ``EditModel`` built from "./data/count_1edit.txt" and
            the training corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model from *corpus*."""
        self.lm = lm
        self.editModel = EditModel("./data/count_1edit.txt", corpus)

    @timeit
    def evaluation(self, corpus):
        """Test this speller on a corpus.

        Every non-empty test case produced by ``corpus.generateTestCases()``
        is corrected with ``correctSentence`` and checked with the test
        case's own ``isCorrection``.

        Returns:
            A ``SpellResult`` built from (number correct, number evaluated).
        """
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            # The version of the sentence that may contain a spelling error.
            errorSentence = sentence.getErrorSentence()
            # Best guess of the corrected sentence under the current models.
            hypothesis = self.correctSentence(errorSentence)
            # The test case itself decides whether the guess is right.
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellResult(numCorrect, numTotal)

    @staticmethod
    def _logEditScore(word, editscore):
        """Return log(editscore), or -inf (with a warning) when the log is undefined."""
        try:
            return math.log(editscore)
        except ValueError:
            # math.log rejects 0 and negative values; such a candidate can
            # never beat one that has a finite score.
            print(word)
            print(" log-probabilities = 0, go check editModel output!")
            return float('-inf')

    def correctSentence(self, sentence):
        """Return a corrected copy of *sentence* with one word replaced.

        Every position except the first and last (the start and end tokens)
        is tried with every alternative proposed by the edit model.  Each
        candidate is scored as ``lm.score(candidate) + log(edit probability)``
        (the language-model score is treated as a log-probability), and the
        single best replacement is applied.  Ties go to the candidate seen
        last.  The unmodified sentence is never scored, so whenever at least
        one candidate with a comparable (non-NaN) score exists, one
        replacement is made.

        Args:
            sentence: a list of words, including the start and end tokens.
                It is modified temporarily while candidates are scored and
                is always restored before this method returns or raises.

        Returns:
            A new list of words; ``[]`` for an empty sentence.
        """
        if len(sentence) == 0:
            return []

        argmax_index = 0
        argmax_word = sentence[0]
        maxscore = float('-inf')

        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            # {corrected-word: P(corrected-word | misspelled-word)} for a
            # word that might be misspelled
            editProbs = self.editModel.getProbabilities(word)
            try:
                for alternative, editscore in editProbs.items():
                    # "no misspelling happened" is not a candidate
                    if alternative == word:
                        continue
                    sentence[i] = alternative
                    # score of the candidate sentence from the language model
                    lmscore = self.lm.score(sentence)
                    # P_final = P(corrected_sentence) * P(corrected-word | misspelled-word),
                    # computed as a sum in log space
                    score = lmscore + self._logEditScore(word, editscore)
                    # keep the highest-scoring candidate seen so far
                    if score >= maxscore:
                        maxscore = score
                        argmax_index = i
                        argmax_word = alternative
            finally:
                # Put the original word back even if a model raised, so the
                # caller's sentence is never left half-edited.
                sentence[i] = word

        argmax = list(sentence)
        # apply the single best correction to the copy
        argmax[argmax_index] = argmax_word
        return argmax