forked from sripakpa/comorbidity-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
plot_joint_final_age.py
232 lines (165 loc) · 7.95 KB
/
plot_joint_final_age.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python
"""
SYNOPSIS
This script plots the count of patients as a distribution of their final
age value. The patients are seperated by their 2 disease phenotype status
and the count are computed seperately for each phenotype status.
DESCRIPTION
For each pair of diseases D1 and D2, there are four possible phenotype
status = {PHI0, PHI1, PHI2, PHI12} corresponding to
PHI0 = "affected by neither D1 or D2"
PHI1 = "affected by D1 but not D2"
PHI2 = "affected by D2 but not D1"
PHI12 = "affected by both D1 and D2"
The patients are seperated by their 2 disease phenotype status and the
counts distribution are plotted seperately for each phenotype status.
EXAMPLES
# Plot for specified disease pair.
python plot_joint_final_age.py -i data/test_EMR_dataset.csv \
--d1 "Breast cancer (female)" --d2 "Schizophrenia"
# Plot for all disease pairs with non-zero patient counts.
python plot_joint_final_age.py -i data/test_EMR_dataset.csv
AUTHOR
Parin Sripakdeevong <sripakpa@stanford.edu>"""
import sys
import time
import optparse
import numpy as np
import matplotlib.pyplot as plt
from emr_database_class import EMRDatabase
from joint_final_age_class import JointFinalAge
def main():
global options
database = EMRDatabase()
database.import_data(options.emr_data_file,
options.diseases_file,
options.code2disease_file)
diseases = database.get_diseases()
diseases.sort()
# Create plot for specified disease pair or for all disease pairs with
# non-zero patient counts.
if options.disease1 != None and options.disease2 != None:
patient_counts, final_age_array = __get_counts(database,
options.disease1,
options.disease2,
options.verbose)
plt = __plot_figure(final_age_array, patient_counts,
options.disease1, options.disease2,
options.normalize, options.verbose)
plt.show()
else:
filtered_diseases = __filter_diseases(diseases, database)
for D1 in filtered_diseases:
for D2 in filtered_diseases:
if D1 == D2: continue
patient_counts, final_age_array = __get_counts(database,
D1, D2,
options.verbose)
plt = __plot_figure(final_age_array, patient_counts, D1, D2,
options.normalize, options.verbose)
plt.close()
def __filter_diseases(diseases, database):
"""Filter for diseases where patients count for disease is non-zero."""
filtered_Ds = []
for D in diseases:
D_patients = database.query_emr_data(D)
if len(D_patients) != 0: filtered_Ds.append(D)
return filtered_Ds
def __get_counts(database, D1, D2, verbose):
"""Extract counts of patients as a distribution of the final age."""
# Patients with D1, D2 or both.
D1orD2_patients = database.query_emr_data([D1, D2], OR_match = True)
# Patients without D1 and without D2.
noD1D2_patients = database.query_emr_data(["not " + D1, "not " + D2])
# Extract counts of patients as a distribution of the final
# age value, seperately for each of 4 disease phenotypes.
joint_final_age = JointFinalAge(D1orD2_patients, noD1D2_patients, verbose)
final_age_array = joint_final_age.get_final_age_array()
patient_counts = joint_final_age.get_patient_counts()
return (patient_counts, final_age_array)
def __plot_figure(final_age_array, patient_counts, D1, D2, normalize, verbose):
"""Plot and save the figure with pyplot."""
sum_counts = np.sum(patient_counts, axis=1)
if normalize:
for index, sum_count in enumerate(sum_counts):
# Make sure sum_count is not zero.
if sum_count == 0: sum_count = 1
patient_counts[index] = patient_counts[index] / sum_count
plt.clf() # clear the current figure.
plt.plot(final_age_array, patient_counts[0],'mx') # PHI0
plt.plot(final_age_array, patient_counts[1],'rs') # PHI1
plt.plot(final_age_array, patient_counts[2],'g^') # PHI2
plt.plot(final_age_array, patient_counts[3],'bo') # PHI12
plt.title('Patients Distribution for %s and %s' % (D1, D2),
fontsize='large')
y_max = 1.4* max(np.max(patient_counts[0]), np.max(patient_counts[1]),
np.max(patient_counts[2]), np.max(patient_counts[3]))
plt.xlim([0.00, 101.00])
plt.ylim([0.00, y_max])
plt.xlabel('Patient\'s Final Age', fontsize='large')
if normalize:
plt.ylabel('Normalized Patient Counts', fontsize='large')
else:
plt.ylabel('Raw Patient Counts', fontsize='large')
plt.legend(['no %s and no %s (total= %d)' % (D1, D2, sum_counts[0]),
'only %s (total= %d)' % (D1, sum_counts[1]),
'only %s (total= %d)' % (D2, sum_counts[2]),
'both %s and %s (total= %d)' % (D1, D2, sum_counts[3])],
loc='best',
prop={'size':10})
# Save the plot as a png image.
fig_name = "joint_final_age_%s_and_%s.png" % (D1, D2)
fig_name = fig_name.replace(" ", "_")
if verbose:
print "saving %s to file" % fig_name
plt.savefig(fig_name, format='png')
return plt
if __name__ == '__main__':
start_time = time.time()
usage = """python plot_joint_final_age.py -i <emr_data_file>
--d1 <1st_disease_name> --d2 <2nd_disease_name>"""
parser = optparse.OptionParser(usage=usage + globals()['__doc__'])
parser.add_option("-i", "--emr_data",
action="store", type="string", dest="emr_data_file",
help="RAW EMR data file.")
parser.add_option("--d1", default=None, action="store", type="string",
dest="disease1",
help="name of the first disease.")
parser.add_option("--d2", default=None, action="store", type="string",
dest="disease2",
help="name of the second disease.")
parser.add_option("--no_normalize", action="store_false", default=True,
dest="normalize",
help="don't normalize by the total counts.")
parser.add_option("--diseases_file", action="store",
default="data/disease-list.txt", type="string",
dest="diseases_file",
help="file contain list of diseases.")
parser.add_option("--code2disease_file", action="store",
default="data/code-to-disease.csv", type="string",
dest="code2disease_file",
help="CSV file mapping ICD9 code to disease.")
parser.add_option("-v", "--verbose", action="store_true", default=False,
dest="verbose", help="verbose output.")
parser.add_option("--no_verbose", action="store_false", default=False,
dest="verbose", help="no verbose output.")
(options, args) = parser.parse_args()
if len(args) != 0:
parser.error("leftover arguments=%s" % args)
if not options.emr_data_file:
parser.error("option -i required")
if options.verbose:
print "-" * 50
print time.asctime()
print "disease1: %s" % options.disease1
print "disease2: %s" % options.disease2
print "normalize: %s" % options.normalize
print "emr_data_file: %s" % options.emr_data_file
print "diseases_file: %s" % options.diseases_file
print "code2disease_file: %s" % options.code2disease_file
print "-" * 50
main()
if options.verbose:
print time.asctime(),
print ' | total_time = %.3f secs' % (time.time() - start_time)
sys.exit(0)