forked from p2/ClinicalTrialsNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
runner.py
465 lines (366 loc) · 13.2 KB
/
runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# A class handling full data runs.
# Great profiling tool: pycallgraph
# pycallgraph graphviz -- ./mypythonscript.py
#
# 2013-05-09 Created by Pascal Pfiffner
#
import os
import logging
from threading import Thread
from ClinicalTrials.sqlite import SQLite
from ClinicalTrials.trial import Trial
from ClinicalTrials.lillycoi import LillyCOI
class Runner (object):
    """ An instance of this class can perform data runs.

    A runner is identified by its `run_id` and keeps per-run metadata in an
    SQLite database ("runs.sqlite") inside `run_dir`. It fetches trials via
    LillyCOI, optionally codifies them with NLP pipelines and offers accessors
    over the stored results.
    """

    # class-level registry of all known runners, keyed by run-id
    runs = {}

    @classmethod
    def get(cls, run_id):
        """ Returns the runner if we already have it, None otherwise.

        Raises if `run_id` is None.
        """
        if run_id is None:
            raise Exception("No run-id provided")
        return cls.runs.get(run_id)

    def __init__(self, run_id, run_dir):
        """ Designated initializer; registers the runner in `Runner.runs`. """
        if run_id is None:
            raise Exception("No run-id provided")

        self.run_id = run_id
        self._name = None
        self.run_dir = run_dir
        self.sqlite_db = os.path.join(run_dir, 'runs.sqlite')
        self.__class__.runs[run_id] = self

        self.catch_exceptions = True        # useful to turn off for debugging
        self.nlp_pipelines = []
        self.discard_cached = False         # ignore cached codes
        self.analyze_keypaths = None        # set of keypaths (strings)

        self.condition = None
        self.term = None
        self.reference_location = None      # tuple (latitude, longitude)
        self.limit = None

        self._status = None
        self._done = False
        self.in_background = False
        self.worker = None                  # background Thread, if any


    # -------------------------------------------------------------------------- Running
    def run(self, fields=None, callback=None):
        """ Start running.
        Arguments you can specify:
        - fields: an array of field names that should be retrieved.
        - callback: a callback function to be run at the end. The first argument
          to the function will be a bool indicating whether the run was
          successful, the second argument is the array of trials found during
          the run.
        """
        if self.in_background:
            # FIX: keep a handle on the worker thread (it was never assigned
            # to self.worker before) so callers can join() or inspect it
            self.worker = Thread(target=self._run, kwargs={'fields': fields, 'callback': callback})
            self.worker.start()
        else:
            self._run(fields, callback)

    def _run(self, fields=None, callback=None):
        """ Runs the whole toolchain.
        Currently writes all status to a file associated with run_id. If the
        first word in that file is "error", the process is assumed to have
        stopped. If it is "done" the work here is done.
        """
        # check prerequisites
        if self.condition is None and self.term is None:
            raise Exception("No 'condition' and no 'term' provided")

        self.assure_run_directory()
        self.status = "Searching for %s trials..." % (self.condition if self.condition is not None else self.term)

        # anonymous callback for progress reporting; `progress` is a 0..1 fraction
        def cb(inst, progress):
            if progress > 0:
                self.status = "Fetching (%d%%)" % (100 * progress)

        # make sure we retrieve the properties that we want to analyze.
        # FIX: work on a copy so we never mutate a list the caller handed in
        if self.analyze_keypaths:
            fields = list(fields) if fields is not None else []
            fields.extend(self.analyze_keypaths)
            fields.append('eligibility')

        # start the search
        self.status = "Fetching %s trials..." % (self.condition if self.condition is not None else self.term)
        lilly = LillyCOI()
        if self.condition is not None:
            trials = lilly.search_for_condition(self.condition, True, fields, cb)
        else:
            trials = lilly.search_for_term(self.term, True, fields, cb)

        if self.limit and len(trials) > self.limit:
            trials = trials[:self.limit]

        # process found trials
        self.status = "Processing..."
        sqlite = SQLite.get(self.sqlite_db)
        progress = 0
        progress_tot = len(trials)
        # FIX: integer division — on Python 3 "/" yields a float, which would
        # make the "progress % progress_each" check below almost never fire
        progress_each = max(5, progress_tot // 25)
        ncts = []
        num_nlp_trials = 0
        nlp_to_run = set()
        for trial in trials:
            ncts.append(trial.nct)
            trial.analyze_keypaths = self.analyze_keypaths

            if self.catch_exceptions:
                try:
                    trial.codify_analyzables(self.nlp_pipelines, self.discard_cached)
                except Exception as e:
                    self.status = 'Error processing trial: %s' % e
                    return
            else:
                trial.codify_analyzables(self.nlp_pipelines, self.discard_cached)

            trial.store()
            self.write_trial(sqlite, trial)

            # make sure we run the NLP pipeline if needed
            to_run = trial.waiting_for_nlp(self.nlp_pipelines)
            if len(to_run) > 0:
                nlp_to_run.update(to_run)
                num_nlp_trials = num_nlp_trials + 1

            # progress
            progress = progress + 1
            if 0 == progress % progress_each:
                self.status = "Processing (%d %%)" % (float(progress) / progress_tot * 100)

        sqlite.commit()

        # run the needed NLP pipelines
        success = True
        for nlp in self.nlp_pipelines:
            if nlp.name in nlp_to_run:
                self.status = "Running %s for %d trials (this may take a while)" % (nlp.name, num_nlp_trials)
                if self.catch_exceptions:
                    try:
                        nlp.run()
                    except Exception as e:
                        self.status = "Running %s failed: %s" % (nlp.name, str(e))
                        success = False
                        break
                else:
                    nlp.run()

        # make sure we codified all criteria
        if success:
            for trial in trials:
                trial.codify_analyzables(self.nlp_pipelines, False)
            self.status = 'done'

        # run the callback
        if callback is not None:
            callback(success, trials)


    # -------------------------------------------------------------------------- NLP Pipelines
    def add_pipeline(self, nlp_pipeline):
        """ Add an NLP pipeline to the runner. """
        # set root directory
        nlp_pipeline.set_relative_root(self.run_dir)

        # add to stack
        if self.nlp_pipelines is None:
            self.nlp_pipelines = []
        self.nlp_pipelines.append(nlp_pipeline)

    def add_pipelines(self, nlp_pipelines):
        """ Add a bunch of NLP pipelines at once. """
        for nlp in nlp_pipelines:
            self.add_pipeline(nlp)


    # -------------------------------------------------------------------------- Status
    @property
    def name(self):
        """ A human-readable name for this run, derived lazily from the search
        condition (or term). """
        if self._name is None:
            self._name = "find '%s'" % (self.condition if self.condition is not None else self.term)
        return self._name

    @property
    def status(self):
        """ The current status string; read from the database when not cached. """
        if self._status is None:
            sqlite = SQLite.get(self.sqlite_db)
            if not sqlite:
                return None
            stat_query = "SELECT status FROM runs WHERE run_id = ?"
            res = sqlite.executeOne(stat_query, (self.run_id,))
            self._status = res[0] if res and len(res) > 0 else 'unknown status'
        return self._status

    @status.setter
    def status(self, status):
        """ Caches the new status and persists it to the database. """
        # lazy %-args: formatting only happens if the log level is enabled
        logging.info("%s: %s", self.name, status)
        self._status = status

        sqlite = SQLite.get(self.sqlite_db)
        if sqlite:
            stat_query = "UPDATE runs SET status = ? WHERE run_id = ?"
            sqlite.executeUpdate(stat_query, (status, self.run_id))
            sqlite.commit()

    @property
    def done(self):
        """ True once the run's status is "done". """
        return 'done' == self.status


    # -------------------------------------------------------------------------- Results
    @staticmethod
    def _append_like_filter(qry, tpls, column, values):
        """ Append a parameterized "column LIKE ?" OR-group to `qry` for each
        entry in `values`, adding the bound '%value%' patterns to `tpls`.
        `column` must be a trusted identifier; the values are bound safely.
        Returns the (possibly extended) query string. """
        if values:
            ors = ['%s LIKE ?' % column] * len(values)
            tpls.extend('%%%s%%' % v for v in values)
            qry += ' AND (' + ' OR '.join(ors) + ')'
        return qry

    def overview(self, restrict='reason'):
        """ Tally intervention types and (drug) trial phases over this run's
        trials; with restrict='reason' only trials without a filter reason
        are counted. """
        if not self.done:
            raise Exception("Trial results are not yet available")
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        # collect intervention types and (drug) trial phases
        types = {}
        phases = {}
        qry = "SELECT types, phases FROM trials WHERE run_id = ?"
        if 'reason' == restrict:
            qry += ' AND reason IS NULL'

        for row in sqlite.execute(qry, (self.run_id,)):
            if row[0]:
                for tp in row[0].split('|'):
                    types[tp] = types.get(tp, 0) + 1
            if row[1]:
                for ph in row[1].split('|'):
                    phases[ph] = phases.get(ph, 0) + 1

        return {
            'intervention_types': types,
            'drug_phases': phases
        }

    def trial_phases(self, restrict='reason', filter_interventions=None):
        """ Return a dict with the number of trials per phase after filtering
        by intervention type. """
        if not self.done:
            raise Exception("Trial results are not yet available")
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        # collect (drug) trial phases
        phases = {}
        qry = "SELECT phases FROM trials WHERE run_id = ?"
        tpls = [self.run_id]
        if 'reason' == restrict:
            qry += ' AND reason IS NULL'

        # FIX: filter by interventions using bound parameters instead of
        # interpolating the values into the SQL string (injection-safe)
        qry = self._append_like_filter(qry, tpls, 'types', filter_interventions)

        # execute query
        for row in sqlite.execute(qry, tuple(tpls)):
            if row[0]:
                for ph in row[0].split('|'):
                    phases[ph] = phases.get(ph, 0) + 1
        return phases

    def trials_json(self, restrict='reason', filter_interventions=None, filter_phases=None):
        """ Returns an array of trial JSON for the matching trials, optionally
        filtered by intervention type and/or drug phases.
        """
        if not self.done:
            raise Exception("Trial results are not yet available")
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        # look up trials. Currently cheaply filtering by string comparison.
        # FIX: the reason-filter used to be hardcoded into the base query AND
        # appended again below, making `restrict` a no-op; apply it once,
        # conditionally, like the other result accessors do
        qry = "SELECT nct FROM trials WHERE run_id = ?"
        if 'reason' == restrict:
            qry += ' AND reason IS NULL'
        tpls = [self.run_id]

        # FIX: both filters now use bound LIKE parameters (injection-safe)
        qry = self._append_like_filter(qry, tpls, 'types', filter_interventions)
        qry = self._append_like_filter(qry, tpls, 'phases', filter_phases)

        trials = []
        fields = ['keyword', 'phase', 'overall_contact']
        lat = float(self.reference_location[0]) if self.reference_location else 0
        lng = float(self.reference_location[1]) if self.reference_location else 0

        # retrieve ncts, closest first
        qry += ' ORDER BY distance ASC'
        for row in sqlite.execute(qry, tuple(tpls)):
            trial = Trial(row[0])
            trial.load()
            trial_dict = trial.json(fields)

            # add closest open trial locations when we have a reference point
            if lat and lng:
                closest = [loc[0].json() for loc in trial.locations_closest_to(lat, lng, open_only=True)]
                trial_dict['location'] = closest
            trials.append(trial_dict)

        # grab trial data in batch from db - PROBLEM: distance order is not preserved
        # for trial in Trial.retrieve(ncts):
        #     trials.append(trial.json(fields))
        return trials

    def write_trial(self, sqlite, trial):
        """ Stores metadata about the given trial pertaining to the current run.
        """
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        # order by location; a large sentinel distance is used when we have no
        # reference location or the trial has no open location
        distance = 99999
        if self.reference_location is not None:
            lat = float(self.reference_location[0])
            lng = float(self.reference_location[1])
            closest = trial.locations_closest_to(lat, lng, limit=1, open_only=True)
            if len(closest) > 0:
                distance = closest[0][1]

        nct_query = "INSERT INTO trials (run_id, nct, types, phases, distance) VALUES (?, ?, ?, ?, ?)"
        sqlite.executeInsert(nct_query, (
            self.run_id,
            trial.nct,
            '|'.join(trial.intervention_types),
            '|'.join(trial.trial_phases),
            distance
        ))

    def write_trial_reason(self, nct, reason):
        """ ONLY TEMPORARY!!! """
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        # FIX: scope the update to this run — the trials table is keyed on
        # (run_id, nct), so updating by nct alone touched other runs' rows.
        # Also use executeUpdate, as elsewhere, since this is an UPDATE.
        nct_query = "UPDATE trials SET reason = ? WHERE run_id = ? AND nct = ?"
        sqlite.executeUpdate(nct_query, (reason, self.run_id, nct))

    def get_ncts(self, restrict='reason'):
        """ Read the previously stored NCTs with their filtering reason (if any)
        and return them as a list of tuples. """
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite is None:
            raise Exception("No SQLite handle, please set up properly")

        nct_query = "SELECT nct, reason FROM trials WHERE run_id = ?"
        if 'reason' == restrict:
            nct_query += ' AND reason IS NULL'
        return [res for res in sqlite.execute(nct_query, (self.run_id,))]

    def commit_transactions(self):
        """ ONLY TEMPORARY in conjunction with write_trial_reason. """
        sqlite = SQLite.get(self.sqlite_db)
        if sqlite:
            sqlite.commit()


    # -------------------------------------------------------------------------- Run Directory
    def assure_run_directory(self):
        """ Make sure the run directory exists and the SQLite schema is in
        place; also registers this run and prunes runs older than 6 hours. """
        if self.run_dir is None:
            raise Exception("No run directory defined for runner %s" % self.name)

        # create our directory
        if not os.path.exists(self.run_dir):
            os.mkdir(self.run_dir)
        if not os.path.exists(self.run_dir):
            raise Exception("Failed to create run directory for runner %s" % self.name)

        # create our SQLite tables
        sqlite = SQLite.get(self.sqlite_db)
        sqlite.execute('PRAGMA foreign_keys = ON')
        sqlite.create('runs', '''(
            run_id VARCHAR UNIQUE,
            date DATETIME DEFAULT CURRENT_TIMESTAMP,
            status VARCHAR
        )''')
        sqlite.create('trials', '''(
            run_id VARCHAR,
            nct VARCHAR,
            reason TEXT,
            types VARCHAR,
            phases VARCHAR,
            distance INT,
            UNIQUE (run_id, nct) ON CONFLICT REPLACE,
            FOREIGN KEY (run_id) REFERENCES runs (run_id) ON DELETE CASCADE
        )''')

        stat_query = "INSERT OR IGNORE INTO runs (run_id, status) VALUES (?, ?)"
        sqlite.executeInsert(stat_query, (self.run_id, 'initializing'))

        # clean runs older than 6 hours
        clean_qry = "DELETE FROM runs WHERE date < datetime('now', '-6 hours')"
        sqlite.execute(clean_qry, ())
        sqlite.commit()