forked from pandegroup/vs-utils
-
Notifications
You must be signed in to change notification settings - Fork 1
/
__init__.py
428 lines (374 loc) · 11.4 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
"""
Utilities for public_data.
"""
import gzip
try:
import ujson as json
except ImportError:
import json
import numpy as np
import pandas as pd
import warnings
def read_json(filename):
    """
    Load the contents of a JSON file.

    Parameters
    ----------
    filename : str
        Filename. Names ending in 'json.gz' are read through gzip and names
        ending in '.json' are read as plain files.

    Returns
    -------
    The object produced by ``json.load`` for the file contents.

    Raises
    ------
    ValueError
        If the filename has neither of the supported endings.
    """
    # Pick the opener first so the parsing code exists only once.
    if filename.endswith('json.gz'):
        opener = gzip.open
    elif filename.endswith('.json'):
        opener = open
    else:
        raise ValueError('Filename must be of type .json or .json.gz.')
    with opener(filename) as handle:
        return json.load(handle)
def read_sid_cid_map(filename):
    """
    Read a SID->CID map from a whitespace-delimited file.

    Each line must hold exactly two fields, a SID followed by a CID; both
    are converted to int. Files whose name ends in '.gz' are read through
    gzip.

    Parameters
    ----------
    filename : str
        SID->CID map.

    Returns
    -------
    sid_cid : dict
        Mapping from integer SID to integer CID.
    """
    opener = gzip.open if filename.endswith('.gz') else open
    mapping = {}
    with opener(filename) as handle:
        for record in handle:
            sid, cid = record.split()
            key = int(sid)
            assert key not in mapping  # each SID may appear only once
            mapping[key] = int(cid)
    return mapping
class PcbaJsonParser(object):
    """
    Parser for PubChemBioAssay JSON.

    The assay description is looked up under 'PC_AssaySubmit' (FTP format)
    first and under 'PC_AssayContainer' (REST format) otherwise.

    Parameters
    ----------
    filename : str
        Filename.
    """
    def __init__(self, filename):
        self.tree = read_json(filename)
        self.data = None  # dataframe cache, filled on first get_data() call

        # move in to the assay description
        try:
            # FTP format
            self.root = self.tree['PC_AssaySubmit']['assay']['descr']
        except KeyError:
            # REST format
            # should just be one record per file
            assert len(self.tree['PC_AssayContainer']) == 1
            self.root = self.tree['PC_AssayContainer'][0]['assay']['descr']

    @staticmethod
    def _join_lines(value):
        """
        Join a list of lines into one newline-separated string, stripping
        each line. Values that are not lists are returned unchanged.
        """
        if isinstance(value, list):
            return '\n'.join([line.strip() for line in value])
        return value

    def get_name(self):
        """
        Get assay name.
        """
        return self.root['name']

    def get_aid(self):
        """
        Get assay AID.
        """
        return self.root["aid"]["id"]

    def get_activity_outcome_method(self):
        """
        Get activity outcome method.

        Returns None when the description has no 'activity_outcome_method'
        entry. When it has one and the assay name contains 'counter'
        (case-insensitive), 'counterscreen' is returned instead of the
        stored value.
        """
        if "activity_outcome_method" not in self.root:
            return None
        if "counter" in self.get_name().lower():
            return "counterscreen"
        return self.root["activity_outcome_method"]

    def get_description(self):
        """
        Get assay description.

        A description stored as a list of lines is joined with newlines.
        """
        return self._join_lines(self.root['description'])

    def get_protocol(self):
        """
        Get assay protocol.

        A protocol stored as a list of lines is joined with newlines.
        """
        return self._join_lines(self.root['protocol'])

    def get_target(self):
        """
        Get assay target.

        TODO: Decide which fields are important. We may be able to match
            targets by mol-id.

        Returns
        -------
        target : dict
            A dictionary containing keys for target information types, such
            as 'name', 'mol-id', and 'molecule-type'. None if the assay
            description has no 'target' entry.
        """
        return self.root.get('target')

    def get_comment(self):
        """
        Get assay comment, or None if there is none.

        A comment stored as a list of lines is joined with newlines.
        """
        if "comment" not in self.root:
            return None
        return self._join_lines(self.root["comment"])

    def get_results(self):
        """
        Get Assay result fields, or None if there are none.
        """
        return self.root.get("results")

    def get_revision(self):
        """
        Get assay revision, or None if there is none.
        """
        return self.root.get("revision")

    def get_result_names(self, from_tid=False):
        """
        Get column names for assay-specific result fields.

        A warning is issued for every field name that occurs more than once.
        An assay without result fields yields an empty list (or dict).

        Parameters
        ----------
        from_tid : bool, optional (default False)
            Return a dict mapping TIDs to field names. If False, returns a
            list of field names.
        """
        tids = {}
        names = []
        # get_results() returns None when the assay declares no result fields
        for field in self.get_results() or []:
            name = field['name'].strip()  # clean up extra whitespace
            if name in names:
                warnings.warn(
                    'Duplicated field "{}" in AID {}'.format(
                        name, self.get_aid()))
            tids[field['tid']] = name
            names.append(name)
        if from_tid:
            return tids
        return names

    def get_data(self):
        """
        Get assay data in a Pandas dataframe.

        Generic fields of each data point become columns named after their
        keys; entries under 'data' are assay-specific and are stored under
        the result field name matching their TID. Result fields that never
        occur in the data are added as columns filled with None. The
        dataframe is cached on the instance.

        Returns
        -------
        A pandas dataframe, or None if the file has no
        'PC_AssaySubmit' -> 'data' section.
        """
        if self.data is not None:
            return self.data
        try:
            data = self.tree['PC_AssaySubmit']['data']
        except KeyError:
            return None

        # populate dataframe
        tids = self.get_result_names(from_tid=True)
        series = []
        for dp in data:
            point = {}
            # .items()/.values() work on both Python 2 and Python 3 dicts
            for key, value in dp.items():
                if key == 'data':  # assay-specific fields
                    for col in value:
                        col_name = tids[col['tid']]
                        # each value is a single {type: value} pair
                        assert len(col['value']) == 1
                        for col_value in col['value'].values():
                            point[col_name] = col_value
                else:  # generic fields
                    point[key] = value
            series.append(point)
        df = pd.DataFrame(series)

        # add missing columns filled with null values
        for col in tids.values():
            if col not in df.columns:
                df[col] = None
        assert len(df) == len(data)
        self.data = df
        return df

    def get_selected_data(self, column_mapping, with_aid=False,
                          phenotype=None):
        """
        Get a subset of the assay data.

        Parameters
        ----------
        column_mapping : dict
            Mapping between output dataframe column names and assay data
            column names from which to extract data (e.g. 'potency':
            'EC50'). Values that are not found in the assay data columns
            will be treated as constants. The mapping is not modified.
        with_aid : bool, optional (default False)
            Annotate each data point with the AID for this assay.
        phenotype : str, optional
            Default phenotype for non-inactive data points (e.g.,
            'inhibitor'). Ignored when column_mapping has a 'phenotype'
            entry.

        Returns
        -------
        A pandas dataframe containing the selected assay data, or None if
        the assay has no data.
        """
        # make a copy of the column mapping
        column_mapping = column_mapping.copy()

        # add standard PCBA columns
        column_mapping['sid'] = 'sid'
        column_mapping['outcome'] = 'outcome'

        # add optional columns
        if with_aid:
            column_mapping['aid'] = self.get_aid()
        elif 'aid' in column_mapping:
            warnings.warn('column_mapping contains "aid"')

        # get selected columns
        data = self.get_data()
        if data is None:
            return None
        old_cols, new_cols = [], []
        constants = {}
        for new_col, old_col in column_mapping.items():
            if old_col not in data.columns:
                constants[new_col] = old_col
            else:
                new_cols.append(new_col)
                old_cols.append(old_col)

        # explicit copy: the renames and inserts below must never reach the
        # dataframe cached by get_data()
        df = data[old_cols].copy()
        df.columns = new_cols  # rename columns to match column_mapping
        for new_col, value in constants.items():
            df.insert(0, new_col, value)  # add constant-valued columns

        # process phenotypes
        if phenotype is not None and 'phenotype' not in column_mapping:
            df.insert(len(df.columns), 'phenotype', phenotype)
            df.loc[df['outcome'] == 'inactive', 'phenotype'] = 'inactive'
        return df
class PcbaPandasHandler(object):
    """
    Collects assay descriptions from PCBA JSON files in a pandas dataframe,
    one row per added dataset.
    """
    def __init__(self):
        columns = ["name", "aid", "activity_outcome_method",
                   "description", "comment", "results", "revision"]
        frame = pd.DataFrame(columns=columns)
        frame['aid'] = frame['aid'].astype(int)  # force AID to int
        self.df = frame
        self.index = 0  # row label used for the next added dataset

    def add_dataset(self, filename):
        """
        Parse a PCBA JSON file and append its description as a new row.

        Parameters
        ----------
        filename : str
            File handed to PcbaJsonParser.
        """
        parser = PcbaJsonParser(filename)
        record = pd.Series({
            "name": parser.get_name(),
            "aid": parser.get_aid(),
            "activity_outcome_method": parser.get_activity_outcome_method(),
            "description": parser.get_description(),
            "comment": parser.get_comment(),
            "results": parser.get_results(),
            "revision": parser.get_revision(),
        })
        self.df.loc[self.index] = record
        self.index += 1  # advance to the next free row label

    def get_dataset(self, index):
        """
        Return the row stored under the given index label.
        """
        return self.df.loc[index]

    def to_csv(self, out):
        """
        Write the internal dataframe to the provided location as csv.
        """
        self.df.to_csv(out)
class PcbaDataExtractor(object):
    """
    Extract selected data from PCBA assay data.

    Parameters
    ----------
    filename : str
        PCBA JSON assay data file.
    config : dict or pd.Series
        Mapping between output dataframe column names and assay data column
        names from which to extract data (e.g. 'potency': 'EC50'). Values
        that are not found in the assay data columns will be treated as
        constants. The mapping passed in is not modified; a cleaned copy is
        stored on the instance.
    with_aid : bool, optional (default True)
        Annotate each data point with the AID for this assay.
    """
    def __init__(self, filename, config, with_aid=True):
        self.filename = filename
        self.parser = PcbaJsonParser(filename)
        self.with_aid = with_aid
        self.config = config
        self.phenotype = None  # default phenotype for this assay
        self._check_config()  # check configuration

    def get_data(self, lower=True, sid_cid=None):
        """
        Get selected data from the assay.

        Parameters
        ----------
        lower : bool, optional (default True)
            Lowercase string fields for consistency.
        sid_cid : dict, optional
            SID->CID mapping. If provided, adds a 'cid' column to the
            dataframe. SIDs missing from the mapping get a null CID.

        Returns
        -------
        A pandas dataframe, or None if the parser yields no data.
        """
        data = self.parser.get_selected_data(
            self.config, with_aid=self.with_aid, phenotype=self.phenotype)
        if data is None:  # the parser found no data section in the file
            return None

        # map SIDs to CIDs
        if sid_cid is not None:
            cids = [sid_cid.get(sid) for sid in data['sid'].values]
            data.loc[:, 'cid'] = pd.Series(cids, index=data.index)

        # lowercase string fields for consistency
        if lower:
            # Series.items() replaces iteritems(), removed in pandas 2
            for col, dtype in data.dtypes.items():
                # also cover pandas string dtypes, not only object columns
                if (dtype == np.dtype('object')
                        or pd.api.types.is_string_dtype(dtype)):
                    data.loc[:, col] = data[col].str.lower()
        return data

    def _check_config(self):
        """
        Check the column mapping and other configuration parameters.

        Works on a copy of the config: drops null entries, adds the common
        'Potency', 'Efficacy' and 'Phenotype' result columns when the assay
        has them, prefixes integer targets with 'GI', and turns a phenotype
        entry that is not a result column into the default phenotype for
        this assay.

        Raises
        ------
        NotImplementedError
            If the phenotype is neither a result column nor a recognized
            phenotype annotation.
        """
        # make a copy of the config
        config = self.config.copy()

        # remove null columns
        # collect the keys first: deleting while iterating a dict raises
        # RuntimeError
        null_keys = [key for key, value in config.items()
                     if pd.isnull(value)]
        for key in null_keys:
            del config[key]

        # check for some common column names
        columns = self.parser.get_result_names()
        if 'Potency' in columns:
            config['potency'] = 'Potency'
        if 'Efficacy' in columns:
            config['efficacy'] = 'Efficacy'
        if 'Phenotype' in columns:
            config['phenotype'] = 'Phenotype'

        # target cleanup
        # add GI prefix to integer targets
        if 'target' in config:
            try:
                gi_number = int(config['target'])
            except ValueError:
                pass
            else:
                # format the parsed integer so a float such as 123.0 does
                # not become 'GI123.0'
                config['target'] = 'GI{}'.format(gi_number)

        # phenotype handling
        # either a column name or a default annotation
        phenotypes = {'in': 'inhibitor', 'ac': 'activator', 'ag': 'activator',
                      'an': 'inhibitor', 'ia': 'inhibitor', 'pam': 'PAM',
                      'nam': 'NAM', '?': None}
        phenotype = config.get('phenotype')
        if phenotype and phenotype not in columns:
            del config['phenotype']  # remove from column mapping
            if phenotype in phenotypes:
                phenotype = phenotypes[phenotype]  # map to full name
            if phenotype not in phenotypes.values():
                raise NotImplementedError(
                    'Unrecognized phenotype "{}"'.format(phenotype))
            self.phenotype = phenotype  # set default phenotype for this assay
        self.config = config