forked from iracooke/proteomics-datatypes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
proteomics.py
421 lines (332 loc) · 14.1 KB
/
proteomics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
"""
Proteomics format classes
"""
import logging
import re
import binascii
from galaxy.datatypes import data
from galaxy.datatypes.data import Text
from galaxy.datatypes.xml import GenericXml
from galaxy.datatypes.binary import Binary
from galaxy.datatypes.tabular import Tabular
log = logging.getLogger(__name__)
class Wiff(Binary):
    """Class for wiff files.

    A composite datatype made of a mandatory ``wiff`` file and an optional
    ``wiff_scan`` file; the primary file is an auto-generated HTML index of
    those components.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its two component files."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is the string 'True', not the boolean.
        # It is truthy, so generate_primary_file() below treats the file as
        # optional; left unchanged in case other code relies on the string.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Build the HTML index page listing every component file.

        :param dataset: dataset passed through to ``get_composite_files``.
        :returns: the HTML document as a single newline-joined string.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (fn, fn, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)


# Register only when the Binary base class in use exposes this hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
class IdpDB(Binary):
    """Binary datatype for files with the ``idpDB`` extension."""

    file_ext = "idpDB"


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary base class offers that hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            # Header spelling corrected (was 'Interprophet Probabaility').
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the formatted HTML peek table, using the report's column names."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
class ProtXmlReport(Tabular):
    """Tabular report produced from a protXML file."""

    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        headers = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]
        self.column_names = headers

    def display_peek(self, dataset):
        """Return the HTML peek table rendered with this report's headers."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    Subclasses provide ``root`` (a regex fragment for the root element name,
    read by :meth:`sniff`) and ``blurb`` (read by :meth:`set_peek`).
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Skips leading lines that start with ``<?`` and then checks whether
        the next line opens with ``<root`` or ``<ns:root``.

        :param filename: path of the file to inspect.
        :returns: True when the first non-``<?`` line matches ``self.root``.
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF, which does not start with '<?',
            # so this loop always terminates.
            while line.startswith('<?'):
                line = contents.readline()
        # pattern match <root or <ns:root for any ns string
        # (raw string: '\w' is an invalid escape in a normal literal)
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
class PepXml(ProteomicsXml):
    """pepXML documents, recognised by an ``msms_pipeline_analysis`` root element."""

    # Root element name matched by the inherited sniff().
    root = "msms_pipeline_analysis"
    file_ext = "pepxml"
    blurb = "pepXML data"
class MzML(ProteomicsXml):
    """mzML mass spectrometry data, plain or indexed."""

    # ``root`` is interpolated into the sniffing regex, so the alternation
    # accepts either an ``mzML`` or an ``indexedmzML`` root element.
    root = "(mzML|indexedmzML)"
    file_ext = "mzml"
    edam_format = "format_3244"
    blurb = "mzML Mass Spectrometry data"
class ProtXML(ProteomicsXml):
    """protXML search results, recognised by a ``protein_summary`` root element."""

    # Root element name matched by the inherited sniff().
    root = "protein_summary"
    file_ext = "protxml"
    blurb = "prot XML Search Results"
class MzXML(ProteomicsXml):
    """mzXML mass spectrometry data, recognised by an ``mzXML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "mzXML"
    file_ext = "mzxml"
    blurb = 'mzXML Mass Spectrometry data'
class MzIdentML(ProteomicsXml):
    """XML identification data, recognised by an ``MzIdentML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "MzIdentML"
    file_ext = "mzid"
    edam_format = "format_3247"
    blurb = 'XML identified peptides and proteins.'
class TraML(ProteomicsXml):
    """TraML transition lists, recognised by a ``TraML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "TraML"
    file_ext = "traml"
    edam_format = "format_3246"
    blurb = 'TraML transition list'
class MzQuantML(ProteomicsXml):
    """XML quantification data, recognised by an ``MzQuantML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "MzQuantML"
    file_ext = "mzq"
    edam_format = "format_3248"
    blurb = 'XML quantification data'
class ConsensusXML(ProteomicsXml):
    """OpenMS consensusXML files, recognised by a ``consensusXML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "consensusXML"
    file_ext = "consensusxml"
    blurb = 'OpenMS multiple LC-MS map alignment file'
class FeatureXML(ProteomicsXml):
    """OpenMS feature files, recognised by a ``featureMap`` root element."""

    # Root element name matched by the inherited sniff().
    root = "featureMap"
    file_ext = "featurexml"
    blurb = 'OpenMS feature file'
class IdXML(ProteomicsXml):
    """OpenMS identification files, recognised by an ``IdXML`` root element."""

    # Root element name matched by the inherited sniff().
    root = "IdXML"
    file_ext = "idxml"
    blurb = 'OpenMS identification file'
class TandemXML(ProteomicsXml):
    """X!Tandem search results, recognised by a ``bioml`` root element."""

    # Root element name matched by the inherited sniff().
    root = "bioml"
    file_ext = "tandem"
    blurb = 'X!Tandem search results file'
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a ``BEGIN IONS`` line appears near the top of the file.

        Lines are compared after stripping trailing whitespace. Scanning
        stops once the line index exceeds ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True on a match, otherwise False (never None).
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip() == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        # Reached for both "limit hit" and "file ended first"; the original
        # fell through and returned None in the second case.
        return False
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if the Mascot MIME-Version header appears near the top.

        Lines are compared, after stripping trailing whitespace, against the
        exact header string below. Scanning stops once the line index exceeds
        ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True on a match, otherwise False (never None).
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip() == mime_version:
                    return True
                if i > max_lines:
                    break
        # Reached for both "limit hit" and "file ended first"; the original
        # fell through and returned None in the second case.
        return False
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True if the file header contains the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the signature is found in the first 20 bytes;
            False otherwise, including when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode plus a bytes signature: the old text-mode read fed
            # str to binascii, which raises TypeError on Python 3 and was
            # silently turned into "no match" by a bare except.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except Exception:
            # Sniffing is best-effort: an unreadable file is simply not RAW.
            return False
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the file size as the blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a generated description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))


# Register the sniffer only when the Binary base class exposes this hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """

    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``.

        Consumes exactly one line from the handle on every call.
        """
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
class SPLibNoIndex(Text):
    """Spectral library (splib) stored without its index files."""

    file_ext = "splib"

    def set_peek(self, dataset, is_multi_byte=False):
        """Fill in the dataset's peek and blurb, or placeholders if it was purged."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        dataset.blurb = 'Spectral Library without index files'
class SPLib(Msp):
    """SpectraST Spectral Library. Closely related to msp format

    A composite datatype made of ``library.splib``, ``library.spidx`` and
    ``library.pepidx``; the primary file is an auto-generated HTML index.

    NOTE(review): ``SPLibNoIndex`` in this module declares the same
    ``file_ext``; confirm which of the two is meant to own ``splib``.
    """
    file_ext = "splib"
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Initialise the datatype and declare its three component files."""
        Msp.__init__(self, **kwd)
        self.add_composite_file('library.splib',
                                description='Spectral Library. Contains actual library spectra',
                                is_binary=False)
        self.add_composite_file('library.spidx',
                                description='Spectrum index', is_binary=False)
        self.add_composite_file('library.pepidx',
                                description='Peptide index', is_binary=False)

    def generate_primary_file(self, dataset=None):
        """Build the HTML index page listing every component file.

        :param dataset: dataset passed through to ``get_composite_files``.
        :returns: the HTML document as a single newline-joined string.
        """
        rval = ['<html><head><title>Spectral Library Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (fn, fn, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'splib Spectral Library Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """ Determines whether the file is a SpectraST generated file.

        The first line must start with ``Name:`` and the second with
        ``LibID:``.
        """
        with open(filename, 'r') as contents:
            return Msp.next_line_starts_with(contents, "Name:") and Msp.next_line_starts_with(contents, "LibID:")
class Ms2(Text):
    """Text datatype for files with the ``ms2`` extension."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines that start with ``H<TAB>`` and
        requires one of them to start with each of the header fields
        ``CreationDate``, ``Extractor``, ``ExtractorVersion`` and
        ``ExtractorOptions``.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at EOF. The previous readline() loop
            # treated the '' returned at EOF as a line to skip and spun
            # forever on empty or header-only files.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)
        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False
        return True
# unsniffable binary format, should do something about this
class XHunterAslFormat(Binary):
    """Annotated spectra in the HLF format.

    Format description: http://www.thegpm.org/HUNTER/format_2006_09_15.html
    """

    file_ext = "hlf"


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary base class offers that hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')
class Sf3(Binary):
    """Binary datatype for Scaffold SF3 files."""

    file_ext = "sf3"


# No sniffer is defined for this format, so it is registered as an
# unsniffable extension when the Binary base class offers that hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('sf3')