-
Notifications
You must be signed in to change notification settings - Fork 1
/
rankabstracts.py
executable file
·164 lines (129 loc) · 4.85 KB
/
rankabstracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
"""
Rank abstract summaries based on their information content and quality
"""
import sys
import glob
import xml
import xmlutil
import htmlutil
import summarylist
__author__ = 'Rodney L. Summerscales'
class XMLSummary:
    """
    Class for reading an XML summary from a file.

    After construction the instance exposes the parsed id, the Group
    nodes, the Outcomes node and the HTML text of the summary, plus a set
    of count* methods reporting how many elements of each kind were found.
    """
    id = None               # integer parsed from the first <Name> element (pubmed id)
    groupNodes = None       # <Group> elements found under <Subjects>; empty if none
    outcomeListNode = None  # the <Outcomes> element when exactly one exists, else None
    htmlData = None         # text of <HTMLData> when exactly one exists, else None

    def __init__(self, filename):
        """Given the name of a file containing the XML summary,
        parse the file and read its contents.

        Raises IndexError if the document has no <Name> element and
        ValueError if the text of that element is not an integer.
        """
        # Import the submodule explicitly: a bare `import xml` does not
        # load xml.dom.minidom, so relying on it only works when some other
        # module happens to have imported it first.
        import xml.dom.minidom

        xmldoc = xml.dom.minidom.parse(filename)

        pmidNodes = xmldoc.getElementsByTagName('Name')
        self.id = int(xmlutil.getText(pmidNodes[0]))

        self.groupNodes = self._findGroupNodes(xmldoc)

        olistNodes = xmldoc.getElementsByTagName('Outcomes')
        if len(olistNodes) == 1:
            self.outcomeListNode = olistNodes[0]
        else:
            self.outcomeListNode = None

        htmlSummaryNodes = xmldoc.getElementsByTagName('HTMLData')
        if len(htmlSummaryNodes) == 1:
            self.htmlData = xmlutil.getText(htmlSummaryNodes[0])
        else:
            self.htmlData = None

    def _findGroupNodes(self, xmldoc):
        """Return the <Group> elements under the document's <Subjects> element.

        Returns an empty list when the document has no <Subjects> element.
        """
        subjectNodes = xmldoc.getElementsByTagName('Subjects')
        if len(subjectNodes) == 0:
            return []
        # getElementsByTagName() must be called on an element, not on the
        # node list returned by the search above.
        # NOTE(review): if several <Subjects> elements can occur, only the
        # first one is used here -- confirm against the summary schema.
        return subjectNodes[0].getElementsByTagName('Group')

    def _countOutcomeElements(self, tagName):
        """Return how many `tagName` elements lie under the Outcomes node.

        Returns 0 when the summary has no usable Outcomes node.
        """
        if self.outcomeListNode is None:
            return 0
        return len(self.outcomeListNode.getElementsByTagName(tagName))

    def countARR(self):
        """ Return number of ARR/NNT values in summary
        (counted as <Statistic> elements under the Outcomes node)
        """
        return self._countOutcomeElements('Statistic')

    def countCostValues(self):
        """
        Return number of cost values in summary
        (counted as <CostValue> elements under the Outcomes node)
        """
        return self._countOutcomeElements('CostValue')

    def countGroups(self):
        """
        Return number of Groups in summary
        """
        return len(self.groupNodes)

    def countOutcomes(self):
        """
        Return number of Outcomes in summary
        (counted as <Outcome> elements under the Outcomes node)
        """
        return self._countOutcomeElements('Outcome')
def qualityBin(nARR, nCostValues, nGroups, nOutcomes):
    """Return the name of the single quality bin a summary belongs to.

    The bins are mutually exclusive and checked from best to worst:
      'high1'  - has both ARR/NNT values and cost values
      'high2'  - has ARR/NNT values or cost values, but not both
      'medium' - has groups or outcomes, but no values
      'low'    - no detected elements
    """
    if nARR > 0 and nCostValues > 0:
        return 'high1'
    if nARR > 0 or nCostValues > 0:
        return 'high2'
    if nGroups > 0 or nOutcomes > 0:
        return 'medium'
    return 'low'


if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Parenthesised single-string form prints the same text under
        # both Python 2 and Python 3.
        print("Usage: rankabstracts.py <INPUT_PATH> <OUTPUT_PATH> ")
        print("Read XML summaries of MEDLINE abstracts in the directory specified by <INPUT_PATH>")
        print("Create HTML summary files that contain the most relevant summaries and write these to <OUTPUT_PATH>")
        sys.exit()

    inputPath = sys.argv[1]
    # The output directory is optional and defaults to the current one.
    if len(sys.argv) > 2:
        outputPath = sys.argv[2]
    else:
        outputPath = './'

    # Paths are joined by plain concatenation below, so make sure both
    # end with a separator.
    if inputPath[-1] != '/':
        inputPath += '/'
    if outputPath[-1] != '/':
        outputPath += '/'

    # build list of summaries
    fileList = glob.glob(inputPath+'*.xml')

    # read summaries
    summaryList = [XMLSummary(filename) for filename in fileList]

    # sorting hat time.
    # sort each summary into exactly one bin
    #   low    - No detected elements (low quality)
    #   medium - Some elements, but no values (mid grade)
    #   high2  - Contains ARR/NNT values or cost values (higher quality)
    #   high1  - Contains both kinds of values (highest quality)
    bins = {'low': [], 'medium': [], 'high1': [], 'high2': []}
    for xmlSummary in summaryList:
        binName = qualityBin(xmlSummary.countARR(),
                             xmlSummary.countCostValues(),
                             xmlSummary.countGroups(),
                             xmlSummary.countOutcomes())
        bins[binName].append((xmlSummary.id, xmlSummary))

    # Order every bin by descending id. Sorting on the id alone avoids
    # comparing XMLSummary instances when two ids are equal.
    for rankedList in bins.values():
        rankedList.sort(key=lambda pair: pair[0], reverse=True)

    lowQuality = bins['low']
    mediumQuality = bins['medium']
    # Summaries with both kinds of values are listed ahead of the rest.
    highQuality = bins['high1'] + bins['high2']

    # write summaries to html files, one file per quality level
    reports = (
        ('Least relevant EBM summaries', 'summaries.low.html', lowQuality),
        ('Somewhat relevant EBM summaries', 'summaries.med.html', mediumQuality),
        ('Most relevant EBM summaries', 'summaries.high.html', highQuality),
    )
    for (title, outputName, rankedList) in reports:
        htmlFile = htmlutil.HTMLFile(title=title)
        for (pmid, xmlSummary) in rankedList:
            htmlFile.addBodyElement(xmlSummary.htmlData)
        htmlFile.writeFile(outputPath+outputName, useSummaryFormat=True)