#!/usr/bin/python
# autoannotate.py -- forked from rlsummerscales/acres
# author: Rodney Summerscales
import sys
#import nltk
#from nltk.corpus import wordnet as wn
from abstractlist import AbstractList
from statlist import StatList
from findertask import FinderTask
from rulebasedfinder import RuleBasedFinder
class AutoAnnotate(RuleBasedFinder):
    """ Use trial registries to annotate abstracts.

        For each abstract that has a registry report, short intervention and
        outcome names from the report are collected and noun phrases in the
        abstract that share a word with them are labeled 'group' / 'outcome'.
    """
    # entityTypes = ['condition', 'group', 'outcome']
    entityTypes = ['group', 'outcome']

    # A registry name is used only if len(name) is below this bound for its
    # type (presumably a token count -- TODO confirm against the sentence
    # class); longer names are reported as '(Discarded)'.
    _maxNameLength = {'group': 5, 'outcome': 7}

    def __init__(self):
        """ Create a rule based finder for the types in entityTypes. """
        RuleBasedFinder.__init__(self, self.entityTypes)

    def test(self, absList, modelFilename):
        """ Apply the mention finder to a given list of abstracts.

            absList -- abstracts to annotate. Modified in place: every
                abstract whose report is None afterwards (no report, or the
                report gave no usable group or no usable outcome name) is
                removed from the list.
            modelFilename -- not used by this finder.

            Registry names and already annotated mentions are printed to
            stdout for inspection.
        """
        for abstract in absList:
            if abstract.report is None:
                continue

            print('%s ------------------------------------------'
                  % (abstract.id,))
            print(abstract.report.id)

            registryEntries = {}
            registryEntries['group'] = self._collectRegistryNames(
                abstract, 'group', abstract.report.interventions, 'Group')
            print('===')
            registryEntries['outcome'] = self._collectRegistryNames(
                abstract, 'outcome', abstract.report.outcomes, 'outcome')

            if not registryEntries['group'] or not registryEntries['outcome']:
                # registry not useful, candidate abstract not useful for training
                abstract.report = None
                continue

            # labelMatches is the matcher in use; GroupFilter, findRepeats,
            # labelUMLSMatches, labelMatches2 and expandMentions are
            # alternatives that are currently not applied here.
            for mType, sentenceList in registryEntries.items():
                self.labelMatches(abstract, mType, sentenceList)

        # iterate over a copy because the list is modified while scanning
        for abstract in absList[:]:
            if abstract.report is None:
                absList.remove(abstract)

    def _collectRegistryNames(self, abstract, mType, registryItems, label):
        """ Return the usable names of the given registry items.

            A name is usable if the item has exactly one name element and
            that element is shorter than the bound for mType. Also prints the
            registry names ('R - ...') followed by the mentions of type mType
            already annotated in the abstract ('A - ...').
        """
        maxLength = self._maxNameLength[mType]
        names = []
        for item in registryItems:
            if len(item.name) != 1:
                continue
            name = item.name[0]
            print('R - %s: %s' % (label, name.toString()))
            if len(name) < maxLength:
                names.append(name)
            else:
                print('(Discarded)')

        print('---')
        for sentence in abstract.sentences:
            mList = sentence.getAnnotatedMentions(mType, recomputeMentions=True)
            for mention in mList:
                print('A - %s: %s' % (label, mention.text))
        return names

    def _nounPhraseNodes(self, abstract):
        """ Yield each noun phrase node found in the simple trees of the
            sentences of the abstract. """
        for sentence in abstract.sentences:
            for node in sentence.getSimpleTree().tokenNodes():
                if node.isNounPhraseNode():
                    yield node

    def _isImportantWord(self, token):
        """ Return True if token is not a stop word, symbol or number. """
        return not (token.isStopWord() or token.isSymbol() or token.isNumber())

    def countMentions(self, abstract, mType):
        """ count the number of mentions annotated by system """
        return sum(
            len(sentence.getDetectedMentions(mType, recomputeMentions=True))
            for sentence in abstract.sentences)

    def expandMentions(self, abstract, mType):
        """ expand the mentions to include all tokens in current phrase:
            if any token of a noun phrase has label mType, label all of them """
        for node in self._nounPhraseNodes(abstract):
            npTokens = node.tokenList()
            if any(token.hasLabel(mType) for token in npTokens):
                for token in npTokens:
                    token.addLabel(mType)

    def labelUMLSMatches(self, abstract, mType, registryEntries):
        """ Label every UMLS chunk in the abstract that shares a best concept
            id with a chunk of one of the registry entries.

            Returns the number of chunks labeled (0 for no registry entries).
        """
        if not registryEntries:
            return 0

        # ids of the best concepts found in the registry entries
        conceptIDs = set()
        for sentence in registryEntries:
            for chunk in sentence.umlsChunks:
                for concept in chunk.getBestConcepts():
                    conceptIDs.add(concept.id)

        nMatches = 0
        for sentence in abstract.sentences:
            for chunk in sentence.umlsChunks:
                # one shared concept is enough; a chunk is counted only once
                if any(concept.id in conceptIDs
                       for concept in chunk.getBestConcepts()):
                    nMatches += 1
                    for token in chunk.getTokens():
                        token.addLabel(mType)
        return nMatches

    def labelMatches(self, abstract, mType, registryEntries):
        """ Label every noun phrase in the abstract that contains a word from
            one of the given registry entries.

            The text and lemma of each registry token that is not a symbol,
            stop word or number are collected; a noun phrase matches if the
            text of any of its tokens is in that collection.

            Returns the number of noun phrases labeled (0 for no entries).
        """
        if not registryEntries:
            return 0

        # Words are accumulated over ALL registry entries. (The set used to
        # be re-created for every entry, so only the last entry was matched.)
        tokenSet = set()
        for sentence in registryEntries:
            for token in sentence:
                if self._isImportantWord(token):
                    tokenSet.add(token.text)
                    tokenSet.add(token.lemma)
        print(tokenSet)

        nMatches = 0
        for node in self._nounPhraseNodes(abstract):
            npTokens = node.tokenList()
            if any(token.text in tokenSet for token in npTokens):
                nMatches += 1
                for token in npTokens:
                    token.addLabel(mType)
        return nMatches

    def labelMatches2(self, abstract, mType, registryEntries):
        """ Label the longest token sequences in the abstract whose words all
            appear in a single registry entry.

            Each registry entry is reduced to the set of texts and lemmas of
            its non-symbol tokens. At every position the longest run matching
            any one entry (and containing at least one important word) is
            labeled mType and skipped over.

            Returns the number of sequences labeled (0 for no entries).
        """
        if not registryEntries:
            return 0

        # one word set per registry entry
        mentionList = []
        for sentence in registryEntries:
            tokenSet = set()
            for token in sentence:
                if not token.isSymbol():
                    tokenSet.add(token.text)
                    tokenSet.add(token.lemma)
            mentionList.append(tokenSet)

        nMatches = 0
        for sentence in abstract.sentences:
            i = 0
            while i < len(sentence):
                maxTokensMatched = 0
                for mention in mentionList:
                    nTokens, nImportantWords = self._matchLength(
                        sentence, i, mention)
                    if nTokens > maxTokensMatched and nImportantWords > 0:
                        maxTokensMatched = nTokens

                if maxTokensMatched == 0:
                    # no match, move to next token
                    i += 1
                else:
                    nMatches += 1
                    for j in range(i, i + maxTokensMatched):
                        sentence[j].addLabel(mType)
                    i += maxTokensMatched
        return nMatches

    def _matchLength(self, sentence, start, mention):
        """ Return (nTokens, nImportantWords) for the run of consecutive
            tokens of sentence, beginning at index start, that match the word
            set mention. nImportantWords counts the matched tokens that are
            not stop words, symbols or numbers. """
        nTokens = 0
        nImportantWords = 0
        while start + nTokens < len(sentence):
            token = sentence[start + nTokens]
            if token.text in mention or token.lemma in mention:
                matched = True
            elif token.isAcronym():
                # an acronym matches if it has an expansion and every token
                # of the expansion is in the mention
                expansionTokens = token.getAcronymExpansion()
                matched = len(expansionTokens) > 0 and all(
                    eToken.text in mention for eToken in expansionTokens)
            else:
                matched = False

            if not matched:
                break
            nTokens += 1
            if self._isImportantWord(token):
                nImportantWords += 1
        return nTokens, nImportantWords

    def GroupFilter(self, abstract):
        """ apply simple rules to an abstract to recognize groups:
            label as 'group' every noun phrase whose head word is 'group' or
            'arm' and that contains at least one important word """
        label = 'group'
        groupWords = set(['group', 'arm'])

        for node in self._nounPhraseNodes(abstract):
            if node.headToken().text not in groupWords:
                continue
            phraseTokens = node.tokenList()
            # NOTE(review): only the word 'group' is excluded here, so a head
            # word 'arm' can count as the important word itself -- confirm
            # that this is intended.
            if any(self._isImportantWord(token) and token.text != 'group'
                   for token in phraseTokens):
                for token in phraseTokens:
                    token.addLabel(label)

    def findRepeats(self, abstract, mType, registryEntries):
        """ find untagged token sequences that match those from the registry
            entries and tag them

            Every registry entry contributes two word sets: all of its token
            texts, and its token texts without ignored words and symbols.
            Returns the number of sequences tagged.
        """
        ignoreWords = set(['a', 'the', 'of', 'in', 'for', 'group', 'groups', 'arm'])

        mentions = []
        for sentence in registryEntries:
            longTokenSet = set()
            shortTokenSet = set()
            for token in sentence.tokens:
                longTokenSet.add(token.text)
                if token.text not in ignoreWords and not token.isSymbol():
                    shortTokenSet.add(token.text)
            mentions.append(longTokenSet)
            mentions.append(shortTokenSet)

        if not mentions:
            return 0

        # find and tag instances in abstract that match the word sets
        nMatches = 0
        for sentence in abstract.sentences:
            i = 0
            # check each token to see if it starts a match
            while i < len(sentence):
                nMatchedTokens = 0
                curMentionList = mentions
                longestMatch = None
                # match current token (and those following it) to the sets
                while len(curMentionList) > 0 \
                        and (i + nMatchedTokens) < len(sentence):
                    nextMentionList = []
                    text = sentence[i + nMatchedTokens].text
                    for mTokenSet in curMentionList:
                        if text in mTokenSet:
                            if (nMatchedTokens + 1) == len(mTokenSet):
                                # as many tokens matched as the set has
                                # words: longest complete match so far
                                longestMatch = mTokenSet
                            else:
                                # there are still more tokens to match
                                nextMentionList.append(mTokenSet)
                    # continue only with the sets that are still matching
                    curMentionList = nextMentionList
                    if len(curMentionList) > 0:
                        nMatchedTokens += 1

                if longestMatch is None:
                    # no match, move to next token
                    i += 1
                else:
                    nMatches += 1
                    nMatchedTokens = len(longestMatch)
                    for j in range(i, i + nMatchedTokens):
                        sentence[j].addLabel(mType)
                    i += nMatchedTokens
        return nMatches
#############################################################################
def main(argv):
    """ Annotate the abstracts found in the directory named by argv[1].

        argv -- command line arguments, argv[0] being the program name.
        Prints a usage message and exits with status 1 if the input path is
        missing.
    """
    if len(argv) < 2:
        print("Usage: autoannotate.py <INPUT_PATH>")
        print("Automatically annotate a collection of abstracts with trial registries")
        print("in the directory specified by <INPUT_PATH>")
        print("using rules and trial registry info.")
        # non-zero status so calling scripts can detect the usage error
        sys.exit(1)

    statList = StatList()
    inputPath = argv[1]
    absList = AbstractList(inputPath)

    finder = AutoAnnotate()
    finderTask = FinderTask(finder)
    finderTask.test(absList, statList)

    # Writing the results is currently disabled:
    # absList.labelsToAnnotations(['group', 'outcome'])
    #
    # for abs in absList:
    #   if abs.report != None:
    #     abs.writeXML(abs.id+'.auto.xml')
    #
    # statList.write('stats.auto.txt', separator=',')


# run the annotation only when executed as a script, not when imported
if __name__ == '__main__':
    main(sys.argv)