-
Notifications
You must be signed in to change notification settings - Fork 0
/
syntacticCorrection.py
132 lines (111 loc) · 5.22 KB
/
syntacticCorrection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
__author__ = 'SEOKHO'
from WindowProb import WindowProb
from textblob import TextBlob
from textblob_aptagger import PerceptronTagger
import generateTagWindows
import PosLikelihood
import lexicalizedTagWindows
import pickle
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn
from sklearn.metrics import classification_report
import math
from PosLikelihood import SynReplacer
# Syntactic correction model.
class SynCorrection:
    """Score the tag windows of a sentence against two window-count models.

    Two ``WindowProb`` models are loaded for the given window size: a "Mod"
    model and an "Orig" model. When ``lex`` is truthy the "Lex" variants of
    both files are loaded and tags are lexicalized before windows are built.
    """

    def __init__(self, winSize, lex=False):
        """Load the window-count models, the lexicalization table and the tagger.

        Args:
            winSize: window size; used both in the model file names and when
                building windows in ``correct``.
            lex: when truthy, load the "Lex" model files and lexicalize tags
                in ``correct``.
        """
        self.winSize = winSize
        self.lex = lex
        self.lexFilename = "Lex" if lex else ""
        prefix = "C:/MissingWord/post" + self.lexFilename
        suffix = str(self.winSize) + ".pickle"
        self.winMod = WindowProb(prefix + "ModComp" + suffix, compressed=True)
        self.winOrig = WindowProb(prefix + "Comp" + suffix, compressed=True)
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # use a toLexicalize.pickle that comes from a trusted source.
        with open("toLexicalize.pickle", "rb") as f:
            self.toLexicalize = pickle.load(f)
        self.aptagger = PerceptronTagger()

    def correct(self, tokens):
        """Return per-window probabilities and confidences for ``tokens``.

        Args:
            tokens: the sentence as a list of token strings; they are joined
                with single spaces before tagging.

        Returns:
            A ``(probs, confidence)`` pair of lists with one entry per window.
            With ``m`` and ``o`` the window's counts in the "Mod" and "Orig"
            models, ``probs`` holds ``(m + 1) / (m + o + 1)`` and
            ``confidence`` holds ``log(m + o + 1)``.
        """
        blob = TextBlob(' '.join(tokens), pos_tagger=self.aptagger)
        completeTags = generateTagWindows.getCompleteTags(blob)
        # Truthiness test, matching how __init__ chose the "Lex" model files;
        # the earlier `== True` check skipped lexicalization for truthy
        # non-True values even though the lexicalized models were loaded.
        if self.lex:
            completeTags = lexicalizedTagWindows.lexicalizeTags(
                completeTags, tokens, self.toLexicalize)
        windows = generateTagWindows.makeWindows(completeTags, size=self.winSize)
        probs = []
        confidence = []
        for window in windows:
            # Look each count up once and reuse it for both outputs.
            modCount = self.winMod.count(window)
            totalCount = modCount + self.winOrig.count(window)
            probs.append((modCount + 1) / (totalCount + 1))
            confidence.append(math.log(totalCount + 1))
        return probs, confidence
def makeFeatures(synCor, synCorLex, synRepl, cutTokens, removedIndex):
    """Assemble the feature vector for one candidate position of a sentence.

    The vector is, in order:
      * the size-9 window (filler 0.0) of ``synCor`` probabilities at
        ``removedIndex``, followed by that window's maximum minus the
        maximum probability over the whole sentence;
      * the same features computed from ``synCorLex``;
      * ``log(len(cutTokens))``;
      * the first five values returned by ``synRepl.fix``.

    Args:
        synCor: model exposing ``correct(tokens) -> (probs, confidence)``.
        synCorLex: second model with the same ``correct`` interface.
        synRepl: object exposing ``fix(tokens, index)``.
        cutTokens: the token list to score.
        removedIndex: position whose features are extracted.

    Returns:
        A flat list of feature values.
    """
    def positionFeatures(model):
        # Only the probabilities are used; the confidences are discarded.
        scores, _ = model.correct(cutTokens)
        scoreWindows = generateTagWindows.makeWindows(scores, size=9, filler=0.0)
        local = list(scoreWindows[removedIndex])
        # Gap between the best score near this position and the best score
        # at any position in the sentence.
        local.append(max(scoreWindows[removedIndex]) - max(scores))
        return local

    features = positionFeatures(synCor)
    features.extend(positionFeatures(synCorLex))
    features.append(math.log(len(cutTokens)))
    features.extend(synRepl.fix(cutTokens, removedIndex)[:5])
    return features
def testSentence():
    """Print the lexicalized size-4 model's output for one fixed sentence."""
    model = SynCorrection(4, lex=True)
    words = "Japan has suspended of buffalo mozzarella from Italy , after reports that high levels of dioxin have been found in the cheese .".split(" ")
    print(model.correct(words))
def main():
    """Build training data from the corpus, train an SVC and pickle it.

    For every corpus line with more than three space-separated tokens, one
    token at a random interior position is deleted. The features at the
    deleted position form a positive example (label 1). Up to three other
    random positions in the same cut sentence form negative examples
    (label 0). The shuffled examples are split 70/30 into train and test
    sets, an ``SVC(C=1)`` is fitted, classification reports for both sets
    are printed, and the classifier is written to ``synCorrTight.clf``.
    """
    synCor = SynCorrection(5)
    synCorLex = SynCorrection(4, lex=True)
    synRep = SynReplacer(lex=True)
    posWindows = []
    negWindows = []
    with open("C:/MissingWord/train/corpusPart2.txt", "r") as f:
        for index, line in enumerate(f):
            line = line.strip()
            if len(line) > 1:
                tokens = line.split(" ")
                if len(tokens) > 3:
                    # Never remove the first or the last token.
                    removed = random.randint(1, len(tokens) - 2)
                    cutTokens = tokens.copy()
                    del cutTokens[removed]
                    posWindows.append(
                        makeFeatures(synCor, synCorLex, synRep, cutTokens, removed))
                    for _ in range(3):
                        negWindowIndex = random.randint(1, len(tokens) - 2)
                        # A draw that hits the true position is skipped, so a
                        # sentence may contribute fewer than three negatives.
                        if negWindowIndex != removed:
                            negWindows.append(
                                makeFeatures(synCor, synCorLex, synRep, cutTokens, negWindowIndex))
            # NOTE(review): the cap is assumed to be checked for every line,
            # including skipped ones - confirm the intended nesting.
            if index > 10000:
                break
    data = [(window, 1) for window in posWindows]
    data.extend((window, 0) for window in negWindows)
    random.shuffle(data)
    cutoff = int(len(data) * 7 / 10)
    trainFeatures = [datum[0] for datum in data[:cutoff]]
    trainLabels = [datum[1] for datum in data[:cutoff]]
    testFeatures = [datum[0] for datum in data[cutoff:]]
    testLabels = [datum[1] for datum in data[cutoff:]]
    # Preview at most ten rows; slicing avoids an IndexError when fewer than
    # ten training examples were collected.
    for features in trainFeatures[:10]:
        print(features)
    trainFeatures = np.array(trainFeatures)
    testFeatures = np.array(testFeatures)
    clf = SVC(C=1)
    clf.fit(trainFeatures, trainLabels)
    trainingPred = clf.predict(trainFeatures)
    print(classification_report(trainLabels, trainingPred))
    print(classification_report(testLabels, clf.predict(testFeatures)))
    with open("synCorrTight.clf", "wb") as f:
        pickle.dump(clf, f)
# Script entry point: runs the single-sentence demo; main() is not invoked here.
if __name__ == "__main__":
    testSentence()