Example #1
0
class SpellCorrection:
    """
    Holds edit model, language model, corpus, trains
    """
    
    def __init__(self, lm, corpus):
        self.lm = lm
        self.editModel = EditModel("./data/count_1edit.txt", corpus)
    
    @timeit    
    def evaluation(self, corpus):
        
        """
        Tests this speller on a corpus
        Returns a spelling result
        """
        
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            # get any possible spell error sentence
            errorSentence = sentence.getErrorSentence()
            # use specific language model to guess highest possible corrected sentence
            hypothesis = self.correctSentence(errorSentence)
            # use test data to check correctness
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellResult(numCorrect, numTotal)
        
    def correctSentence(self, sentence):
        
        """
        Takes a list of words, including words or error
        Returns a corrected list of words.
        """
        
        if len(sentence) == 0:
            return []
        argmax_index = 0
        argmax_word = sentence[0]
        maxscore = float('-inf')
        maxlm = float('-inf')
        maxedit = float('-inf')
        
        # skip start and end tokens
        for i in range(1, len(sentence)-1):
            word = sentence[i]
            # return a dictionary {corrected-word: P(corrected-word|misspelled-word)} given a might-mis-spelled word
            editProbs = self.editModel.getProbabilities(word) 
            for alternative, editscore in editProbs.items():
                # no mis-spell happened, pass
                if alternative == word:
                    continue
                sentence[i] = alternative
                # get score of the corrected-sentence from language model
                lmscore = self.lm.score(sentence)
                try:
                    editscore = math.log(editscore)
                except ValueError:
                    editscore = float('-inf')
                    print word
                    print " log-probabilities = 0, go check editModel output!"
                # P_final=P(corrected_sentence)*P(corrected-word|misspelled-word);
                score = lmscore + editscore
                # find the highest one and store it
                if score >= maxscore:
                    maxscore = score
                    maxlm = lmscore
                    maxedit = editscore
                    argmax_index = i
                    argmax_word = alternative
            sentence[i] = word 
            argmax = list(sentence)
            # correct the spell error given might-mis-spelled word
            argmax[argmax_index] = argmax_word
        return argmax