forked from elainexmas/NLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpellCorrect.py
140 lines (117 loc) · 4.71 KB
/
SpellCorrect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import math
from Datum import Datum
from Sentence import Sentence
from HolbrookCorpus import HolbrookCorpus
from UniformLanguageModel import UniformLanguageModel
from UnigramLanguageModel import UnigramLanguageModel
from StupidBackoffLanguageModel import StupidBackoffLanguageModel
from LaplaceUnigramLanguageModel import LaplaceUnigramLanguageModel
from LaplaceBigramLanguageModel import LaplaceBigramLanguageModel
from CustomLanguageModel import CustomLanguageModel
from EditModel import EditModel
from SpellingResult import SpellingResult
import types
# Modified version of Peter Norvig's spelling corrector
"""Spelling Corrector.
Copyright 2007 Peter Norvig.
Open source code under MIT license: http://www.opensource.org/licenses/mit-license.php
"""
import re, collections
class SpellCorrect:
    """Holds the edit model and the language model; corrects and evaluates.

    A candidate correction is scored as
    ``languageModel.score(sentence) + log(edit probability)`` and at most one
    word per sentence is replaced.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        lm     -- object exposing ``score(sentence)``; its result is added to
                  a log edit probability, so it is presumably a
                  log-probability (confirm against the model classes).
        corpus -- handed to ``EditModel`` together with the edit-count file
                  '../data/count_1edit.txt'.
        """
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def evaluate(self, corpus):
        """Test this speller on a corpus and return a SpellingResult.

        Every non-empty test case from ``corpus.generateTestCases()`` has its
        error sentence corrected; a case counts as correct when
        ``sentence.isCorrection(hypothesis)`` is true. Empty cases are
        skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Take a list of words, return a corrected copy of that list.

        For every position except the first and last (the start and end
        tokens), each alternative offered by
        ``editModel.editProbabilities(word)`` other than the word itself is
        substituted and scored. The single best-scoring substitution is
        applied to a copy of ``sentence``; if no alternative is scored the
        copy equals the input. An empty input returns ``[]``.

        The input list is never modified, even if a model raises.
        """
        if len(sentence) == 0:
            return []

        # Defaults re-assign the first word to itself, i.e. "no change".
        argmax_i = 0
        argmax_w = sentence[0]
        maxscore = float('-inf')

        # Score substitutions on a private copy so the caller's list is not
        # left half-edited if a model raises part-way through.
        candidate = list(sentence)

        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            editProbs = self.editModel.editProbabilities(word)
            # items() rather than iteritems(): works on Python 2 and 3.
            for alternative, editProb in editProbs.items():
                if alternative == word:
                    continue
                candidate[i] = alternative
                lmscore = self.languageModel.score(candidate)
                # math.log(0) raises, so a zero probability maps to -inf.
                if editProb != 0:
                    editscore = math.log(editProb)
                else:
                    editscore = float('-inf')
                score = lmscore + editscore
                # >= means that on a tie the later candidate wins.
                if score >= maxscore:
                    maxscore = score
                    argmax_i = i
                    argmax_w = alternative
            candidate[i] = word  # restore before moving to the next position

        argmax = list(sentence)  # copy it
        argmax[argmax_i] = argmax_w  # correct it
        return argmax

    def correctCorpus(self, corpus):
        """Correct a whole corpus, return a JSON representation of the output.

        The result is a compact JSON array holding, for each sentence in
        ``corpus.corpus``, the array of words returned by
        ``correctSentence`` for that sentence's error sentence.
        """
        # Local import keeps this method self-contained; the module header
        # does not import json.
        import json

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes inside words and renders an
        # empty sentence as [] (hand-built formatting produced [""]).
        # Compact separators keep the layout free of extra whitespace.
        return json.dumps(corrected, separators=(',', ':'))
def main():
    """Train all of the language models and test them on the dev data.

    Change devPath if you wish to do things like test on the training data.
    For each model a label line is printed, followed by the string form of
    the SpellingResult returned by SpellCorrect.evaluate.
    """
    trainPath = '../data/holbrook-tagged-train.dat'
    trainingCorpus = HolbrookCorpus(trainPath)
    devPath = '../data/holbrook-tagged-dev.dat'
    devCorpus = HolbrookCorpus(devPath)

    # (label, language-model class) in reporting order. The trailing figures
    # are carried over from the original source comments (presumably the
    # dev-set scores the author observed).
    experiments = [
        ('Uniform Language Model: ', UniformLanguageModel),  # 31/471
        ('Laplace Unigram Language Model: ', LaplaceUnigramLanguageModel),  # 52/471
        ('Laplace Bigram Language Model: ', LaplaceBigramLanguageModel),  # 64/471
        ('Stupid Backoff Language Model: ', StupidBackoffLanguageModel),  # 87/471
        ('Custom Language Model: ', CustomLanguageModel),  # 52/471
    ]

    for label, modelClass in experiments:
        # print(x) with one argument behaves the same as the Python 2 print
        # statement and as the Python 3 print function.
        print(label)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        print(str(outcome))
# Script entry point: run the train-and-evaluate pass only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()