-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpellCorrect.py
172 lines (146 loc) · 6.95 KB
/
SpellCorrect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import math
from Datum import Datum
from Sentence import Sentence
from HolbrookCorpus import HolbrookCorpus
from UniformLanguageModel import UniformLanguageModel
from UnigramLanguageModel import UnigramLanguageModel
from StupidBackoffLanguageModel import StupidBackoffLanguageModel
from StupidBackoffTrigramLanguageModel import StupidBackoffTrigramLanguageModel
from LaplaceUnigramLanguageModel import LaplaceUnigramLanguageModel
from LaplaceBigramLanguageModel import LaplaceBigramLanguageModel
from LaplaceTrigramLanguageModel import LaplaceTrigramLanguageModel
from LaplaceFourgramLanguageModel import LaplaceFourgramLanguageModel
from CustomLanguageModel import CustomLanguageModel
from CustomLanguageModel2 import CustomLanguageModel2
from EditModel import EditModel
from SpellingResult import SpellingResult
import types
# Modified version of Peter Norvig's spelling corrector
"""Spelling Corrector.
Copyright 2007 Peter Norvig.
Open source code under MIT license: http://www.opensource.org/licenses/mit-license.php
"""
import re, collections
class SpellCorrect:
    """Noisy-channel spelling corrector for sentences.

    Holds a language model (scores whole sentences) and an edit model
    (proposes candidate corrections for a single word).
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model from `corpus`.

        lm     -- object exposing score(sentence) -> log-probability.
        corpus -- training corpus handed to EditModel together with the
                  edit-count file 'data/count_1edit.txt'.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable correction, assuming exactly one error.

        sentence -- list of words. The first and last positions are never
                    edited (the sentence is expected to be wrapped in
                    <s> ... </s> markers).

        Every candidate returned by the edit model for every interior word
        is tried in turn; the trial sentence maximising
            languageModel.score(trial) + candidate log-probability
        is returned. The input list is not modified. An empty input yields
        an empty list; if no candidate is ever produced, a copy of the
        input is returned.
        """
        if not sentence:
            return []

        bestSentence = sentence[:]  # fallback: unchanged copy of the input
        bestScore = float('-inf')

        # Skip index 0 and the last index: sentence boundary markers.
        for i in range(1, len(sentence) - 1):
            # Each candidate is indexed as (replacement word, log-probability
            # of the observed word given that replacement).
            for candidate in self.editModel.editProbabilities(sentence[i]):
                replacement, editLogProb = candidate[0], candidate[1]
                trialSentence = sentence[:]
                trialSentence[i] = replacement
                # Noisy channel: sentence log-probability plus the
                # conditional log-probability of the edit.
                score = self.languageModel.score(trialSentence) + editLogProb
                if score > bestScore:
                    bestScore = score
                    bestSentence = trialSentence
        return bestSentence

    def evaluate(self, corpus):
        """Test this speller on a corpus and return a SpellingResult.

        Iterates corpus.generateTestCases(), skips empty sentences, and
        counts how many corrected hypotheses are accepted by
        sentence.isCorrection(). The result is built as
        SpellingResult(numCorrect, numTotal).
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Correct every sentence in corpus.corpus and return the result as JSON.

        The return value is a compact JSON string: an array with one array
        of words per sentence. Serialisation goes through json.dumps so
        that words containing quotes, backslashes or control characters are
        escaped properly, and an empty sentence is emitted as [] rather
        than [""]. Non-ASCII characters are written through unescaped.
        """
        import json  # local import keeps this block self-contained

        correctedSentences = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # Compact separators reproduce the comma-only layout of the previous
        # hand-built output for ordinary words.
        return json.dumps(correctedSentences, separators=(',', ':'),
                          ensure_ascii=False)
def main():
    """Train each language model on the training data and report its
    spelling-correction result on the dev data.

    Point dev_path at another file (for example the training file) to
    evaluate on different data.
    """
    train_path = 'data/holbrook-tagged-train.dat'
    dev_path = 'data/holbrook-tagged-dev.dat'
    trainingCorpus = HolbrookCorpus(train_path)
    devCorpus = HolbrookCorpus(dev_path)

    # (heading printed before training, language-model class), in report order.
    experiments = (
        ('Unigram Language Model: ', UnigramLanguageModel),
        ('Uniform Language Model: ', UniformLanguageModel),
        ('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel),
        ('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel),
        ('Laplace Trigram Language Model: ', LaplaceTrigramLanguageModel),
        ('Laplace 4gram Language Model: ', LaplaceFourgramLanguageModel),
        ('Stupid Backoff Language Model: ', StupidBackoffLanguageModel),
        ('Stupid Backoff Trigram Language Model: ', StupidBackoffTrigramLanguageModel),
        ('Custom Language Model: ', CustomLanguageModel),
        ('Custom Language Model 2: ', CustomLanguageModel2),
    )

    for heading, model_class in experiments:
        print(heading)
        language_model = model_class(trainingCorpus)
        speller = SpellCorrect(language_model, trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        print(str(outcome))
# Run the full evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()