forked from sripakpa/comorbidity-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
optimize_log_likelihood_class.py
403 lines (312 loc) · 14.9 KB
/
optimize_log_likelihood_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
#!/usr/bin/env python
import math
import time
import numpy as np
from scipy import optimize
from genetic_penetrance_class import GeneticPenetrance
from joint_age_of_onset_class import JointAgeOfOnset
from joint_final_age_class import JointFinalAge
from norm_prevalence_func import create_prval_norm_func
from log_likelihood_func import create_log_likelihood_func
from log_likelihood_func import create_log_likelihood_fprime
class OptimizeLogLikelihood(object):
    """Maximize the log-likelihood of the genetic penetrance model.

    Find the model's parameters (rho1, rho2 and rho12) that maximize the
    log-likelihood of observing the EMR data according to the genetic
    penetrance models proposed by Rzhetsky et al. 2007.

    Parameters
    ----------
    verbose : bool, optional
        Verbose output.

    opt_method : ['Nelder-Mead', 'Powell', 'CG', 'BFGS' (default)], optional
        Name of method use to perform the optimization.

    use_analytical_gradient : bool, optional
        Analytically compute the gradient when using gradient-based
        optimization methods (i.e. 'CG' and 'BFGS').

    Attributes
    ----------
    tau1 : integer
        Model's parameter indicating the minimum number of deleterious
        mutation in S1 or S12, the combined region of the genome that
        predisposes the polymorphisms' bearers to disease D1.
        The tau1 values used in Rzhetsky et al. 2007 were tau1 = 1 or
        tau1 = 3.

    tau2 : integer
        Model's parameter indicating the minimum number of deleterious
        mutation in S2 or S12, the combined region of the genome that
        predisposes the polymorphisms' bearers to disease D2.
        The tau2 values used in Rzhetsky et al. 2007 were tau2 = 1 or
        tau2 = 3.

    overlap_type : ["independent", "cooperation", "competition"]
        Model's parameter describing the type of genetic overlap used in
        the model.

    rng : np.random.RandomState
        Pseudo-random number generator used to draw the starting point of
        each optimization run. Seeded with a fixed value by default.

    log_likelihood_func : python function
        A function to evaluate the log_likelihood value as a function of
        rho1, rho2 and rho12. ``run`` calls it both as ``f(x)`` and, through
        the optimizer, as ``f(x, log_rho, sign)``.

    log_likelihood_fprime : python function
        A function to evaluate the derivative of the log-likelihood with
        respect to rho1, rho2 and rho12.
    """

    def __init__(self, verbose=False, opt_method="BFGS",
                 use_analytical_gradient=False):
        """Instantiate variables in LogLikelihood class.

        ``opt_method`` and ``use_analytical_gradient`` default to the values
        that were previously hard-coded, so ``OptimizeLogLikelihood()`` and
        ``OptimizeLogLikelihood(verbose)`` behave as before.
        """
        self.opt_method = opt_method
        self.use_analytical_gradient = use_analytical_gradient
        self.verbose = verbose

        self.tau1 = None
        self.tau2 = None
        self.overlap_type = None

        self.rng = np.random.RandomState(50)  # fixed seed

        self.log_likelihood_func = None
        self.log_likelihood_fprime = None

    def set_opt_method(self, opt_method):
        """Set optimization method name attribute.

        The name is not validated here; an unsupported name makes ``run``
        raise ValueError.
        """
        self.opt_method = opt_method

    def set_use_analytical_gradient(self, use_analytical_gradient):
        """Set use_analytical_gradient attribute."""
        # The assignment was missing, which silently turned this setter into
        # a no-op and made the analytical gradient impossible to enable.
        self.use_analytical_gradient = use_analytical_gradient

    def set_use_random_seed(self, use_random_seed):
        """Use fixed or random seed for pseudo-random number generator.

        When ``use_random_seed`` is true the generator is re-seeded without
        an explicit seed value. When it is false the generator is left
        untouched (it is NOT reset to the fixed seed).
        """
        # re-seed the pseudo-random number generator
        if use_random_seed:
            self.rng.seed()

    # Public methods
    def get_log_likelihood_func(self):
        """Return the log-likelihood function built by the setup method.

        Raises
        ------
        RuntimeError
            If ``setup_log_likelihood_func`` has not been called yet.
        """
        if self.log_likelihood_func is None:
            raise RuntimeError("log_likelihood_func not yet setup.")

        return self.log_likelihood_func

    def setup_log_likelihood_func(self,
                                  database,
                                  D1, D2,
                                  tau1, tau2,
                                  overlap_type,
                                  threshold_type,
                                  prevalence_file=None,
                                  norm_prval_method=None):
        """Setup the log_likelihood function.

        Stores ``tau1``, ``tau2``, ``overlap_type`` and the generated
        log-likelihood function and gradient function on the instance.

        Parameters
        ----------
        database : EMRDatabase class object
            This class allow query of EMR data by diseases, gender, and
            ethnicity criteria.

        D1 : string
            Name of the first disease.

        D2 : string
            Name of the second disease.

        tau1 : integer
            Model's parameter indicating the minimum number of deleterious
            mutation in S1 or S12.

        tau2 : integer
            Model's parameter indicating the minimum number of deleterious
            mutation in S2 or S12.

        overlap_type : ["independent", "cooperation", "competition"]
            Model's parameter describing the type of genetic overlap used in
            the model.

        threshold_type : ["sharp", "soft"]
            Model's parameter describing the type of genetic penetrance used
            in the model.

        prevalence_file : string
            Location of the prevalence data file.

        norm_prval_method : [None, "rzhetsky", "max", "min",
                             "avg", "wts_avg", "sum"]
            Protocol to normalize the prevalence data. Normalization is only
            applied when both ``prevalence_file`` and ``norm_prval_method``
            are given.
        """
        # Query patients data
        D1orD2_patients = database.query_emr_data([D1, D2], OR_match=True)
        noD1D2_patients = database.query_emr_data(["not " + D1, "not " + D2])

        # Conditional probabilities: P(phi(t) | phi(infty))
        joint_age_of_onset = JointAgeOfOnset(D1orD2_patients)
        joint_age_of_onset_funcs = joint_age_of_onset.get_funcs()

        # Patient counts as a distribution of the final age value.
        joint_final_age = JointFinalAge(D1orD2_patients, noD1D2_patients)
        final_age_array = joint_final_age.get_final_age_array()
        patient_counts = joint_final_age.get_patient_counts()

        # Setup the genetic_penetrance model to compute the age-integrated
        # phenotype probabilities: P(phi(infty) ; rho1, rho2, rho12)
        genetic_penetrance = GeneticPenetrance(tau1,
                                               tau2,
                                               overlap_type,
                                               threshold_type)

        # Create a function that will normalize the raw EMR data to match the
        # general population disease prevalence.
        if prevalence_file is not None and norm_prval_method is not None:
            prval_norm_func = (
                create_prval_norm_func(D1, D2,
                                       database.get_disease2count(),
                                       database.get_tot_patient_count(),
                                       prevalence_file,
                                       norm_prval_method))
        else:
            prval_norm_func = None

        # Function to evaluate log-likelihood at given rho1, rho2 and rho12.
        log_likelihood_func = (
            create_log_likelihood_func(genetic_penetrance,
                                       joint_age_of_onset_funcs,
                                       patient_counts,
                                       final_age_array,
                                       prval_norm_func))

        # Function to evaluate the derivative of the log-likelihood with
        # respect to rho1, rho2 and rho12.
        log_likelihood_fprime = (
            create_log_likelihood_fprime(genetic_penetrance,
                                         joint_age_of_onset_funcs,
                                         patient_counts,
                                         final_age_array,
                                         prval_norm_func))

        self.tau1 = tau1
        self.tau2 = tau2
        self.overlap_type = overlap_type
        self.log_likelihood_func = log_likelihood_func
        self.log_likelihood_fprime = log_likelihood_fprime

    def run(self, save_path=False):
        """Optimize the log_likelihood function starting from a random point
        in the parameter space.

        Parameters
        ----------
        save_path : bool, optional
            Save and return the coordinates along the optimization path.

        Returns
        -------
        opt_log_likelihood : float
            The log_likelihood value after optimization.

        opt_param : a dictionary
            Dictionary containing the coordinates after optimization.
            The key-value pairs are:
                "rho1" : value of rho1 (float)
                "rho2" : value of rho2 (float)
                "rho12" : value of rho12 (float)
                "log_likelihood" : log_likelihood value (float)

        path : None or dictionary
            If save_path is False, then path is None.
            If save_path is True, then path is a dictionary storing the
            coordinates at each iteration along the optimization path. The
            key-value pairs are:
                "rho1" : list of rho1 values.
                "rho2" : list of rho2 values.
                "rho12" : list of rho12 values.
                "log_likelihood" : list of log_likelihood value.
            The ith element of each list stores the value at the ith
            optimization iteration step.

        Raises
        ------
        RuntimeError
            If the log-likelihood function or its gradient has not been
            set up yet.
        ValueError
            If ``opt_method`` is not one of the supported method names.
        """
        if self.log_likelihood_func is None:
            raise RuntimeError("log_likelihood_func not yet setup.")

        if self.log_likelihood_fprime is None:
            raise RuntimeError("log_likelihood_fprime not yet setup.")

        rho1_max = 1.0 * self.tau1
        rho2_max = 1.0 * self.tau2
        rho12_max = (rho1_max + rho2_max) / 2.0

        # rand() draws from [0, 1); the tiny offset keeps every start value
        # strictly positive so that the logarithms below are always defined.
        (rho1, rho2, rho12) = self.rng.rand(3) + 1.0e-20
        (rho1, rho2, rho12) = (rho1 * rho1_max,
                               rho2 * rho2_max,
                               rho12 * rho12_max)

        if self.verbose:
            print("Optimizing log_likelihood (tau1 = %d, tau2 = %d, "
                  "overlap_type = %s)" % (self.tau1, self.tau2,
                                          self.overlap_type))
            print("start: rho1 = %.3f, rho2 = %.3f, rho12 = %.3f" % (rho1,
                                                                     rho2,
                                                                     rho12))

        # The log_likelihood func is not defined for negative rhos. Therefore,
        # optimize in the log_rho space instead of the rho space, since this
        # ensure that the rhos themselves never become negative.
        log_rho = True
        start_point = [math.log(rho1), math.log(rho2), math.log(rho12)]

        log_likelihood_fprime = None
        if self.use_analytical_gradient:
            log_likelihood_fprime = self.log_likelihood_fprime

            if self.verbose:
                # NOTE(review): the check runs with the objective's default
                # extra arguments rather than the (log_rho, sign) pair handed
                # to the optimizer below - confirm that this is intended.
                grad_error = (
                    optimize.check_grad(self.log_likelihood_func,
                                        log_likelihood_fprime,
                                        start_point))
                print("gradient calc error=  %s" % grad_error)

        minimize = self.__get_minimize_func()

        sign = -1.0  # maximize instead of minimize.
        start_time = time.time()
        xopt, allvec = minimize(self.log_likelihood_func,
                                start_point,
                                args=(log_rho, sign,),
                                fprime=log_likelihood_fprime)

        # convert log_rho back to rho
        opt_rho1, opt_rho2, opt_rho12 = self._rhos_from_log_point(xopt)

        if self.verbose:
            pad = " " * 8
            print("%s minimize_time = %.3f secs" % (pad,
                                                    time.time() - start_time))
            # The reported rho12 is the raw optimizer coordinate here, i.e.
            # it is not zeroed for the "independent" overlap type.
            print("%s rho1 = %.3f, rho2 = %.3f, rho12 = %.3f"
                  % (pad, opt_rho1, opt_rho2, np.exp(xopt[2])))

            for n, point in enumerate(allvec):
                step_rho1, step_rho2, step_rho12 = (
                    self._rhos_from_log_point(point))
                print("%s %3d: rho1, rho2, rho12, log_likelihood =  "
                      "%.3f %.3f %.3f %.3f"
                      % (pad, n, step_rho1, step_rho2, step_rho12,
                         self.log_likelihood_func(point)))
            print("")

        opt_param = dict()
        opt_param["rho1"] = opt_rho1
        opt_param["rho2"] = opt_rho2
        opt_param["rho12"] = opt_rho12
        opt_param["log_likelihood"] = self.log_likelihood_func(xopt)

        if save_path:
            path = {'rho1': [], 'rho2': [], 'rho12': [], 'log_likelihood': []}
            for point in allvec:
                step_rho1, step_rho2, step_rho12 = (
                    self._rhos_from_log_point(point))
                path['rho1'].append(step_rho1)
                path['rho2'].append(step_rho2)
                path['rho12'].append(step_rho12)
                path['log_likelihood'].append(self.log_likelihood_func(point))
        else:
            path = None

        opt_log_likelihood = opt_param["log_likelihood"]

        return opt_log_likelihood, opt_param, path

    # Private methods
    def _rhos_from_log_point(self, point):
        """Convert a point in log_rho space into (rho1, rho2, rho12).

        rho12 is reported as 0.0 when the overlap type is "independent".
        """
        rho1 = np.exp(point[0])
        rho2 = np.exp(point[1])
        rho12 = np.exp(point[2])
        if self.overlap_type == "independent":
            rho12 = 0.0
        return rho1, rho2, rho12

    def __get_minimize_func(self):
        """Select and return the specified optimization function.

        The returned callable has the uniform signature
        ``minimize(f, x0, args=(), fprime=None)`` and returns the
        ``(xopt, allvecs)`` pair produced by the scipy routine with
        ``retall=True``. ``fprime`` is ignored by the gradient-free methods.

        Raises
        ------
        ValueError
            If ``self.opt_method`` is not a supported method name.

        Notes
        ------
        Use this function to experiment with different values of gtol and
        ftol.

        gtol : float (scipy default is 1e-05)
            Gradient tolerance, stop when norm of gradient is less than gtol.

        ftol : float (scipy default is 0.0001)
            Function tolerance, relative error in func(xopt) acceptable for
            convergence.
        """
        gtol_test = 0.001
        ftol_test = 0.001

        if self.opt_method == 'Nelder-Mead':
            # Nelder-Mead is not a gradient-based method
            def minimize(f, x0, args=(), fprime=None):
                return optimize.fmin(f, x0, args=args, ftol=ftol_test,
                                     retall=True, disp=self.verbose)

        elif self.opt_method == 'Powell':
            # Powell is not a gradient-based method
            def minimize(f, x0, args=(), fprime=None):
                return optimize.fmin_powell(f, x0, args=args, ftol=ftol_test,
                                            retall=True, disp=self.verbose)

        elif self.opt_method == 'CG':
            # CG is a gradient-based method
            def minimize(f, x0, args=(), fprime=None):
                return optimize.fmin_cg(f, x0, args=args, gtol=gtol_test,
                                        retall=True, disp=self.verbose,
                                        fprime=fprime)

        elif self.opt_method == 'BFGS':
            # BFGS is a gradient-based descent method
            def minimize(f, x0, args=(), fprime=None):
                return optimize.fmin_bfgs(f, x0, args=args, gtol=gtol_test,
                                          retall=True, disp=self.verbose,
                                          fprime=fprime)

        else:
            # Use the attribute: the bare name `opt_method` is undefined in
            # this scope and used to raise NameError instead of ValueError.
            raise ValueError("Unvalid opt_method %s" % self.opt_method)

        return minimize