forked from rlsummerscales/acres
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filterabstracts.py
executable file
·133 lines (114 loc) · 4.48 KB
/
filterabstracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# author: Rodney Summerscales
import glob
import shutil
import sys
import xml.dom
import xml.dom.minidom

import nltk.data
import nltk.stem.wordnet
import nltk.tokenize.treebank

import costvaluefinder
import lemmatizeabstracts
import sentence
import tokenlist
import xmlutil
def keepForIschemiaCorpus(xmldoc):
""" Return True if we should keep this abstract for the ischemia corpus
Include abstract in ischemia corpus if it contains at least 4 integers.
"""
textNodeList = xmldoc.getElementsByTagName('AbstractText')
nIntegers = 0
for textNode in textNodeList:
text = xmlutil.getText(textNode)
tokens = tokenizer.tokenize(text)
for token in tokens:
if token.isInteger():
nIntegers += 1
return nIntegers > 3
def keepForDiabetesCorpusCostValue(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus.

        Include abstract in diabetes corpus if it contains at least *one*
        currency value (as judged by the module-level CostValueFinder).
    """
    costValueCount = 0
    for node in xmldoc.getElementsByTagName('AbstractText'):
        abstractText = xmlutil.getText(node)
        # split into sentences, then wrap each sentence's word tokens in the
        # project's Sentence/TokenList types so the cost-value finder can run
        for sentText in sentenceSplitter.tokenize(abstractText):
            words = tokenlist.TokenList()
            words.convertStringList(tokenizer.tokenize(sentText))
            for tok in sentence.Sentence(words):
                lemmatizeabstracts.lemmatizeToken(tok)
                if cvFinder.tokenIsCostValue(tok):
                    costValueCount += 1
    return costValueCount > 0
def keepForDiabetesCorpus(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus.

        Include abstract in diabetes corpus if it contains at least one cost
        value or cost-related term, and is long enough (> 100 tokens).
    """
    # guard: require an <Abstract> element with at least one <AbstractText>
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if not abstractNodes:
        return False
    textNodes = abstractNodes[0].getElementsByTagName('AbstractText')
    if not textNodes:
        return False

    cueLemmaSet = {"cost", "QALY", "QALYs"}
    nCostValues = 0
    nCostTerms = 0
    tokenCount = 0
    for node in textNodes:
        abstractText = xmlutil.getText(node)
        for sentText in sentenceSplitter.tokenize(abstractText):
            words = tokenlist.TokenList()
            words.convertStringList(tokenizer.tokenize(sentText))
            for tok in sentence.Sentence(words):
                tokenCount += 1
                lemmatizeabstracts.lemmatizeToken(tok)
                # a cue term is either a whole-lemma match or any token whose
                # surface text contains the substring 'cost'
                if tok.lemma in cueLemmaSet or 'cost' in tok.text:
                    nCostTerms += 1
                if cvFinder.tokenIsCostValue(tok):
                    nCostValues += 1
    return (nCostValues > 0 or nCostTerms > 0) and tokenCount > 100
if len(sys.argv) < 3:
print "Usage: filterabstracts.py <INPUT_PATH> <OUTPUT_PATH> <IGNORE_FILE>"
print "Read MEDLINE XML abstracts in the directory specified by <INPUT_PATH>"
print "Copy those abstracts that contain at least 4 integers to <OUTPUT_PATH>"
print "Ignore abstracts found in the file <IGNORE_FILE>"
sys.exit()
inputPath = sys.argv[1]
outputPath = sys.argv[2]
# build list of abstracts to ignore (possibly used in another corpus)
ignoreSet = set([])
if len(sys.argv) > 3:
ignoreFile = sys.argv[3]
file = open(ignoreFile, 'r')
for line in file.readlines():
[pmid, xml] = line.split('.')
ignoreSet.add(pmid)
if inputPath[-1] != '/':
inputPath += '/'
if outputPath[-1] != '/':
outputPath += '/'
# initialize sentence splitter and tokenizer
sentenceSplitter = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
cvFinder = costvaluefinder.CostValueFinder()
fileList = glob.glob(inputPath+'*.xml')
for filename in fileList:
xmldoc = xml.dom.minidom.parse(filename)
pmidNodes = xmldoc.getElementsByTagName('PMID')
if len(pmidNodes) > 0:
pmid = xmlutil.getText(pmidNodes[0])
if pmid in ignoreSet:
print pmid, 'already annotated'
else:
# if keepForIschemiaCorpus(xmldoc):
if keepForDiabetesCorpus(xmldoc):
# copy abstract
print 'Copying: ', filename
shutil.copy(filename, outputPath)