forked from Bilingual-Annotation-Task-Force/python-tagger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Evaluation.py
executable file
·233 lines (203 loc) · 8.89 KB
/
Evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# Evaluation.py
# Using Python 2.7.11
#trial line August 11, 2016
import sys
import re
import io
from HiddenMarkovModel import HiddenMarkovModel
import string
from nltk.tag.stanford import StanfordNERTagger
from collections import Counter
from CharNGram import *
from CodeSwitchedLanguageModel import CodeSwitchedLanguageModel
import math
import csv
""" Splits text input into words and formats them, splitting by whitespace
@param text a string of text
@return a list of formatted words
"""
# case-insensitive tokenizer for ngram probabilities only
def toWords(text): # separates punctuation
# requires utf-8 encoding
token = re.compile(ur'[\w]+|[^\s\w]', re.UNICODE)
tokens = re.findall(token, text)
return [word.lower() for word in tokens]
"""
def toWords(text): # splits on white space
tokens = re.sub("\t|\n|\r", "", text)
return [word.lower() for word in tokens.split()]
"""
def toWordsCaseSen(text):  # separates punctuation
    """Split text input into tokens, separating punctuation; preserves case.

    @param text a (unicode) string of text; requires utf-8 decoded input
    @return a list of word and punctuation tokens with original casing
    """
    # r'' instead of ur'': same matches under re.UNICODE on Python 2,
    # and valid syntax on Python 3 (ur'' is not).
    token = re.compile(r'[\w]+|[^\s\w]', re.UNICODE)
    return re.findall(token, text)
"""
def toWordsCaseSen(text): # splits on white space
tokens = re.sub("\t|\n|\r", "", text)
return tokens.split()
"""
# Return a transition matrix built from the gold standard
# Pass in tags for both languages
def getTransitions(tags, lang1, lang2):
    """Build a log-probability transition matrix from a gold tag sequence.

    @param tags sequence of tags; every element must be lang1 or lang2
                (the caller pre-filters the gold standard to those two)
    @param lang1 first language tag (e.g. "Eng")
    @param lang2 second language tag (e.g. "Spn")
    @return dict mapping previous tag -> {next tag: log probability}
    """
    transitions = {lang1: {}, lang2: {}}
    # Count adjacent (previous, next) tag pairs
    counts = Counter(zip(tags, tags[1:]))
    total = sum(counts.values())  # Get new total for language tags
    # .items() behaves the same on Python 2.7 and is valid on Python 3;
    # the previous .iteritems() is Python-2-only.
    # NOTE(review): probabilities are normalized by the grand total of all
    # transitions rather than per source state, so each row does not sum
    # to 1 — confirm this is what HiddenMarkovModel expects.
    for (prev, nxt), count in counts.items():  # Compute transition matrix
        transitions[prev][nxt] = math.log(count / float(total))
    return transitions
class Evaluator:
    """Tags code-switched (English/Spanish) text with an HMM over character
    n-gram models, marks named entities via Stanford NER, and evaluates the
    output against a gold standard."""

    def __init__(self, cslm, transitions, tags):
        """
        @param cslm CodeSwitchedLanguageModel scoring words per language
        @param transitions tag-transition log-probability matrix
        @param tags list of language tags, e.g. ["Eng", "Spn"]
        """
        self.cslm = cslm
        self.transitions = transitions
        self.tags = tags
        # Stanford NER models for each language; paths are relative to the
        # working directory — presumably run from the repo root, verify.
        self.engClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")
        self.spanClassifier = StanfordNERTagger(
            "../stanford-ner-2015-04-20/classifiers/spanish.ancora.distsim.s512.crf.ser.gz",
            "../stanford-ner-2015-04-20/stanford-ner.jar")

    def tagger(self, text_list):
        """Tag every token in text_list.

        @param text_list list of tokens to tag
        @return list of 7-tuples: (word, language tag, named-entity tag,
                Eng n-gram prob, Spn n-gram prob, HMM transition prob,
                total prob) — probability fields are "N/A" for Punct/Num
        """
        hmm = HiddenMarkovModel(text_list, self.tags, self.transitions, self.cslm)
        hmmtags = hmm.generateTags()  # generate list of hmm tags
        words = hmm.words  # generate list of words
        taggedTokens = []
        prevLang = "Eng"  # assumed start state for transition lookups
        engTags = []
        spnTags = []
        engTag = ""
        spanTag = ""
        # r'' instead of ur'': identical matches, valid on Python 2 and 3
        token = re.compile(r'[^\w\s]', re.UNICODE)
        print("Tagging {} words".format(len(words)))
        for k, word in enumerate(words):
            # check if punctuation else use hmmtag
            lang = 'Punct' if re.match(token, word) and not word[-1].isalpha() else hmmtags[k]
            lang = 'Num' if word.isdigit() else lang
            index = k % 1000
            # BUGFIX: run NER on each 1000-word batch at its boundary
            # regardless of the boundary word's tag. Previously the batch
            # was tagged only when the boundary word was non-punctuation,
            # so a punctuation word at a boundary left engTags/spnTags
            # empty or stale and later lookups crashed or misaligned.
            if index == 0:
                engTags = self.engClassifier.tag(words[k:k+1000])
                spnTags = self.spanClassifier.tag(words[k:k+1000])
            # check if word is NE (punctuation is never one)
            if lang != "Punct":
                engTag = engTags[index][1]
                spanTag = spnTags[index][1]
            else:
                engTag = "O"
                spanTag = "O"
            # mark as NE if either classifier identifies it
            if engTag != 'O' or spanTag != 'O':
                NE = "{}/{}".format(engTag, spanTag)
            else:
                NE = "O"
            # record probabilities (only meaningful for language-tagged words)
            if lang in ("Eng", "Spn"):
                hmmProb = round(hmm.transitions[prevLang][lang], 2)
                engProb = round(self.cslm.prob("Eng", word), 2)
                spnProb = round(self.cslm.prob("Spn", word), 2)
                totalProb = (hmmProb + engProb) if lang == "Eng" else (hmmProb + spnProb)
                prevLang = lang
            else:
                hmmProb = "N/A"
                engProb = "N/A"
                spnProb = "N/A"
                totalProb = "N/A"
            taggedTokens.append((word, lang, NE, str(engProb), str(spnProb), str(hmmProb), str(totalProb)))
        return taggedTokens

    # Tag testCorpus and write to output file
    def annotate(self, testCorpus):
        """Tag the file at path testCorpus and write <base>_annotated.txt."""
        print("Annotation Mode")
        # BUGFIX: str.strip(".txt") removes any of the characters '.', 't',
        # 'x' from BOTH ends (mangling names like "text.txt" -> "ex");
        # remove the suffix explicitly instead.
        base = testCorpus[:-len(".txt")] if testCorpus.endswith(".txt") else testCorpus
        with io.open(base + '_annotated.txt', 'w', encoding='utf8') as output:
            # NOTE(review): opened without an explicit encoding, so the
            # platform default is used — confirm the corpus is utf-8 and
            # consider encoding='utf8' here.
            text = io.open(testCorpus).read()
            testWords = toWordsCaseSen(text)
            tagged_rows = self.tagger(testWords)
            output.write(u"Token\tLanguage\tNamed Entity\tEng-NGram Prob\tSpn-NGram Prob\tHMM Prob\tTotal Prob\n")
            for row in tagged_rows:
                csv_row = '\t'.join([unicode(s) for s in row]) + u"\n"
                print(csv_row)
                output.write(csv_row)
        print("Annotation file written")

    # Evaluate goldStandard and write to output file
    def evaluate(self, goldStandard):
        """Re-tag the gold standard file and score language/NE accuracy,
        writing <goldStandard>_outputwithHMM.txt."""
        print("Evaluation Mode")
        with io.open(goldStandard + '_outputwithHMM.txt', 'w', encoding='utf8') as output:
            # create list of text and tags; close the handle when done
            # (the original leaked it)
            with io.open(goldStandard, 'r', encoding='utf8') as gold:
                lines = gold.readlines()
            text, gold_tags = [], []
            for x in lines:
                columns = x.split("\t")
                # token and gold tag are the last two tab-separated fields
                text.append(columns[-2].strip())
                gold_tags.append(columns[-1].strip())
            # annotate text with model
            annotated_output = self.tagger(text)
            tokens, lang_tags, NE_tags, engProbs, spnProbs, hmmProbs, totalProbs = map(list, zip(*annotated_output))
            # set counters to 0
            langCorrect = langTotal = NECorrect = NETotal = 0
            evaluations = []
            # compare gold standard and model tags
            for lang, NE, gold in zip(lang_tags, NE_tags, gold_tags):
                if gold in ('Eng', 'Spn'):  # evaluate language tags
                    langTotal += 1
                    if gold == lang:
                        langCorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # evaluate NE tags
                elif gold == "NamedEnt":
                    NETotal += 1
                    if NE != 'O':
                        NECorrect += 1
                        evaluations.append("Correct")
                    else:
                        evaluations.append("Incorrect")
                # don't evaluate punctuation
                else:
                    evaluations.append("NA")
            # write accuracy header then one row per token
            # NOTE(review): divides by langTotal/NETotal without a zero
            # guard — a gold file with no language (or no NamedEnt) rows
            # raises ZeroDivisionError; confirm inputs always contain both.
            output.write(u"Language Accuracy: {}\n".format(langCorrect / float(langTotal)))
            output.write(u"NE Accuracy: {}\n".format(NECorrect / float(NETotal)))
            output.write(u"Token\tGold Standard\tTagged Language\tNamed Entity\tEvaluation\n")
            for all_columns in zip(text, gold_tags, lang_tags, NE_tags, evaluations):
                output.write(u"\t".join(all_columns) + u"\n")
        print("Evaluation file written")
"""
Process arguments
Get corpora and create NGram models
Create Code-Switch Language Model
Build Markov model with Expectation Maximization
Annotate
Evaluate
"""
# Evaluation.py goldStandard testCorpus
# Evaluation.py goldStandard testCorpus
def main(argv):
    """Entry point. argv = [goldStandardPath, testCorpusPath].

    Trains one character n-gram model per language, builds the
    code-switched language model and gold-standard transition matrix,
    then annotates argv[1] and evaluates against argv[0].
    """
    n = 5  # character n-gram order
    # Train a character n-gram model for each language
    engData = toWords(io.open("./TrainingCorpora/EngCorpus-1m.txt", 'r', encoding='utf8').read())
    spnData = toWords(io.open('./TrainingCorpora/MexCorpus.txt', 'r', encoding='utf8').read())
    enModel = CharNGram('Eng', getConditionalCounts(engData, n), n)
    esModel = CharNGram('Spn', getConditionalCounts(spnData, n), n)
    cslm = CodeSwitchedLanguageModel([enModel, esModel])
    tags = [u"Eng", u"Spn"]
    # Split on tabs and extract the gold standard tag; the with-block
    # closes the handle (the original leaked it)
    with io.open(argv[0], 'r', encoding='utf8') as goldStandard:
        goldTags = [x.split("\t")[-1].strip() for x in goldStandard.readlines()]
    otherSpn = ["NonStSpn", "SpnNoSpace"]
    otherEng = ["NonStEng", "EngNoSpace", "EngNonSt"]
    # Convert all tags to either Eng or Spn and remove others
    goldTags = ["Eng" if x in otherEng else x for x in goldTags]
    goldTags = ["Spn" if x in otherSpn else x for x in goldTags]
    goldTags = [x for x in goldTags if x in ("Eng", "Spn")]
    # Compute prior based on gold standard
    transitions = getTransitions(goldTags, tags[0], tags[1])
    # renamed from 'eval' to avoid shadowing the builtin
    evaluator = Evaluator(cslm, transitions, tags)
    evaluator.annotate(argv[1])
    evaluator.evaluate(argv[0])
# Use an array of arguments?
# Should user pass in number of characters, number of languages, names of
# languages?
if __name__ == "__main__":
main(sys.argv[1:]) # Skip over script name