def calculate_precursors(calibrant):
    """
    Derive candidate precursor (m/z, charge) pairs from the most recent
    MS1 scan; returns (pairs, possibly-updated calibrant).
    """
    if data.format == 'raw':
        raw_survey = data.lscan(lastMS1ScanName)
        survey, calibrant = raw_scan_recalibration(raw_survey, calibrant)
    else:
        try:
            survey = data.scan(lastMS1ScanName, centroid=True)
        except NotImplementedError:
            # Reader has no native centroiding; centroid the profile scan.
            survey = centroid_func(data.scan(lastMS1ScanName))
    envelopes = peak_pick(survey, tolerance=0.01, min_peaks=2,
                          enforce_isotopic_ratios=True)[0]
    # One (monoisotopic m/z, charge) pair per envelope peak, flattened.
    candidates = [(peak[0][0], chg)
                  for chg, env in envelopes.items()
                  for peak in env]
    return candidates, calibrant
def calculate_precursors(self):
    """
    Derive candidate precursor (m/z, charge) pairs from the most recent
    MS1 scan and store them on self.possible_precursors.
    """
    if self.data.format == 'raw':
        if self.long_ms1:
            # Is calibration valid in this case?
            survey = self.get_long_MS1(self.lastMS1ScanName)
        else:
            survey = self.raw_scan_recalibration(
                self.data.lscan(self.lastMS1ScanName))
    else:
        try:
            survey = self.data.scan(self.lastMS1ScanName, centroid = True)
        except NotImplementedError:
            # Reader has no native centroiding; centroid the profile scan.
            survey = centroid_func(self.data.scan(self.lastMS1ScanName))

    envelopes = peak_pick(survey, **self.deisoreduce_MS1_args)[0]
    # One (monoisotopic m/z, charge) pair per envelope peak, flattened.
    self.possible_precursors = [(peak[0][0], chg)
                                for chg, env in envelopes.items()
                                for peak in env]
def get_long_MS1_byProfile(self, scanNum):
    """
    Build a "long" MS1 by summing the profile-mode MS1 at scanNum together
    with its immediately adjacent MS1 scans, then centroiding the summed
    profile.  Returns the centroided combined scan.

    Assumes self.ms1_list is a sorted list of MS1 scan identifiers (bisect
    is used on it) -- TODO confirm; self.MS1_scan_batch is a dict caching
    raw profile scans between calls.
    """
    # This seems to take a really long time. Profile-mode scans are heavy!
    ms1_index = bisect.bisect_left(self.ms1_list, scanNum)
    # The MS1 at scanNum plus, where they exist, the one before and after.
    scannumbers = [self.ms1_list[ms1_index + i] for i in [-1, 0, 1]
                   if ms1_index + i >= 0 and ms1_index + i < len(self.ms1_list)]
    ms1s = []
    for scannum in scannumbers:
        # Reuse a cached profile scan if this scan was fetched on an
        # earlier call; otherwise read it and cache it.
        if scannum in self.MS1_scan_batch:
            ms1s.append(self.MS1_scan_batch[scannum])
        else:
            scan = self.data.scan(scannum, centroid = False)
            self.MS1_scan_batch[scannum] = scan
            ms1s.append(scan)
    # Evict cache entries at or before the earliest scan of this window --
    # presumably safe because callers advance through scans in increasing
    # order, so those entries can't be requested again; verify against caller.
    to_del = [k for k in self.MS1_scan_batch if k <= scannumbers[0]]
    for scannum in to_del:
        del self.MS1_scan_batch[scannum]

    # MS1 profile scans aren't consistent across the entire MZ range,
    # apparently dependent upon whether there is signal in a particular
    # location; we don't want to cover the entire range evenly, since
    # this would expand the data and slow things down, but we also don't
    # want to miss/mis-assign signal that occurs in only one scan of the
    # batch.
    long_mzs = set()
    for b in ms1s:
        long_mzs.update(round(x[0], 4) for x in b)

    long_ms1 = []
    inds = [0]*len(ms1s)
    # Merge pass: walk the union of (rounded) m/z values in ascending order,
    # keeping one cursor per scan, and fold every not-yet-consumed point
    # below the current grid m/z into its summed intensity.
    for lmz in sorted(long_mzs):
        sumint = 0
        for j in xrange(len(ms1s)):
            bat, ind = ms1s[j], inds[j]
            # NOTE(review): strict '<' means a raw point whose m/z rounds
            # *up* to lmz is credited to the following grid point rather
            # than this one -- confirm this is the intended binning.
            while ind < len(bat) and bat[ind][0] < lmz:
                sumint += bat[ind][1]
                inds[j] += 1
                ind = inds[j]
        long_ms1.append((lmz, sumint))
    return centroid_func(long_ms1)
def extract(datafile, outputfile=None, default_charge=2, centroid=True, scan_type=None, deisotope_and_reduce_charge=True, deisotope_and_reduce_args={}, min_mz=140, precursor_tolerance=0.005, isobaric_labels=None, label_tolerance=0.01): """ Converts a mzAPI-compatible data file to MGF. Writes only MS2 spectra where these can be determined, otherwise takes every spectrum in the file. Likewise writes the precursor charge and mass if these can be determined. deisotope_and_reduce_charge deisotopes and charge-reduces each MS2 spectrum, which generally improves results from peptide database search algorithms. However, it should be disabled for very low-resolution scans. """ # Currently doesn't compensate for injection time! Would be required in # order to deal with iTRAQ/TMT labels. from multiplierz.spectral_process import deisotope_reduce_scan, peak_pick from multiplierz.spectral_process import centroid as centroid_func # Distinct from 'centroid' argument. def _get_precursor(mz, possible_prec, charge): try: return min([ x for x in possible_prec if (charge == None or x[1] == charge) ], key=lambda x: abs(x[0] - mz)) except ValueError: return None, None if not outputfile: outputfile = datafile + '.mgf' if os.path.exists(outputfile): assert outputfile.lower().endswith('mgf'), ( "Overwriting a non-MGF file %s with " "the MGF extractor is probably a mistake." % outputfile) writer = MGF_Writer(outputfile) data = mzFile(datafile) scanInfo = data.scan_info() # Coerce that scanInfo be in order of time, so that for .WIFF files # we can still use the previous-MS1 method to look up precursor charges. scanInfo.sort(key=lambda x: x[0]) if datafile.lower().endswith('.raw'): # May also exist for WIFF? filters = dict(data.filters()) # For RAW files only, there's the option to filter by a given # scan type. (It would be more efficient in many cases to # actually split files in a single run, though.) 
if scan_type: scan_type = scan_type assert (scan_type.lower() in ['cid', 'hcd', 'etd', 'etdsa']), ("Invalid scan type %s, must be one" "of (CID, HCD, ETD, ETDSA).") % scan_type typestr = "@%s" % scan_type.lower() scanInfo = [ x for x in scanInfo if x[3] == 'MS1' or typestr in filters[x[0]] ] else: filters = None assert not scan_type, "Scan type filtering only enabled with .RAW format files." if isobaric_labels: assert centroid, "Isobaric tags can only be read from centroided data; set 'centroid' to True." if not isobaric_labels: labels = [] elif isobaric_labels == 4 or isobaric_labels == '4plex': labels = zip([114, 115, 116, 117], [114.11, 115.11, 116.11, 117.12]) elif isobaric_labels == 6 or isobaric_labels == '6plex': labels = zip([126, 127, 128, 129, 130, 131], [126.127, 127.131, 128.134, 129.138, 130.141, 131.138]) elif isobaric_labels == 8 or isobaric_labels == '8plex': labels = zip( [113, 114, 115, 116, 117, 118, 119, 121], [113.11, 114.11, 115.11, 116.11, 117.12, 118.12, 119.12, 121.12]) elif isobaric_labels == 10 or isobaric_labels == '10plex': labels = zip([ '126', '127N', '127C', '128N', '128C', '129N', '129C', '130N', '130C', '131' ], [ 126.127726, 127.124761, 127.131081, 128.128116, 128.134436, 129.131471, 129.137790, 130.134825, 130.141145, 131.138180 ]) assert label_tolerance < 0.005, ( "label_tolerance must be lower " "than 0.005 for 10-plex experiments! (Currently %s)" % label_tolerance) else: raise NotImplementedError, ("Labels of type %s not recognized.\n" "Should be one of [4,6,8,10] or None.") def read_labels(scan): partscan = [x for x in scan if x[0] < labels[-1][1] + 3] if not partscan: return dict([(str(l), '0') for l in zip(*labels)[0]]) # This should probably actually sum all points within # the tolerance range. 
scan_values = {} for label, mz in labels: nearpt = min(partscan, key=lambda x: abs(x[0] - mz)) if abs(nearpt[0] - mz) < label_tolerance: scan_values[str(label)] = '%.3f' % nearpt[1] else: scan_values[str(label)] = '0' # Report noise value? return scan_values inconsistent_precursors = 0 scans_written = 0 lastMS1 = None lastMS1ScanName = None recal_factor = 1 calibrant = RAW_CAL_MASS for time, mz, scanNum, scanLevel, scanMode in scanInfo: scanName = scanNum if isinstance(scanNum, int) else time if scanLevel == 'MS1': lastMS1ScanName = scanName possible_precursors = None def calculate_precursors(calibrant): if data.format == 'raw': lastMS1 = data.lscan(lastMS1ScanName) lastMS1, calibrant = raw_scan_recalibration( lastMS1, calibrant) else: try: lastMS1 = data.scan(lastMS1ScanName, centroid=True) except NotImplementedError: lastMS1 = centroid_func(data.scan(lastMS1ScanName)) envelopes = peak_pick(lastMS1, tolerance=0.01, min_peaks=2, enforce_isotopic_ratios=True)[0] return sum([[(x[0][0], c) for x in xs] for c, xs in envelopes.items()], []), calibrant continue elif scanLevel == 'MS3': continue elif lastMS1ScanName == None: continue # Each file type handles centroiding differently (or not at all.) if data.format == 'raw': scan = data.scan(scanName, centroid=centroid) scan, calibrant = raw_scan_recalibration(scan, calibrant) elif data.format == 'wiff': # explicit_numbering, of course, can't be active here. scan = data.scan(scanName) if centroid: scan = centroid_func(scan) elif data.format == 'd': scan = data.scan(scanName, centroid=centroid) if centroid and not scan: # mzAPI.D returns empty if centroid data is not present in # the file, but that can be corrected by external centroiding. 
scan = centroid_func(data.scan(scanName, centroid=False)) else: raise NotImplementedError, "Extractor does not handle type %s" % data.format if filters and not mz: mz = float(filters[time].split('@')[0].split(' ')[-1]) mzP = None chargeP = None if "scanPrecursor" in dir(data): assert isinstance(scanName, int) mzP, chargeP = data.scanPrecursor(scanName) if not mzP: # .scanPrecursor sometimes returns charge and not mzP. if possible_precursors == None: possible_precursors, calibrant = calculate_precursors( calibrant) mzP, chargeP = _get_precursor(mz, possible_precursors, chargeP) if not mzP: # Release presumed charge possibly obtained from scanPrecursor. mzP, chargeP = _get_precursor(mz, possible_precursors, None) if mz and chargeP: inconsistent_precursors += 1 if mzP and (abs(mz - mzP) < 2 or not mz): mz = mzP charge = chargeP else: charge = default_charge if not charge: charge = default_charge if not mz: import warnings errmgf = os.path.abspath(datafile) warnings.warn('Unable to recover all precursor masses from %s' % errmgf) else: if labels: scan_labels = read_labels(scan) else: scan_labels = {} title = standard_title_write(datafile, rt=time, mz=mz, mode=scanMode, scan=scanNum, **scan_labels) # Should expand extract() call to include arguments to this. if deisotope_and_reduce_charge and centroid: if ('tolerance' not in deisotope_and_reduce_args or not deisotope_and_reduce_args['tolerance']): deisotope_and_reduce_args[ 'tolerance'] = precursor_tolerance scan = deisotope_reduce_scan(scan, **deisotope_and_reduce_args) scan = [x for x in scan if x[0] > min_mz] assert charge, title writer.write(scan, title, mass=mz, charge=charge) scans_written += 1 writer.close if inconsistent_precursors: vprint("Precursor inconsistencies: %s/%s" % (inconsistent_precursors, scans_written)) return outputfile
def run(self):
    """
    Generator driving the extraction: walks self.scanInfo in order and
    yields (scan, title, mz, charge) for each MS2 scan to be written.

    Updates self.scans_written and self.inconsistent_precursors as it goes;
    MS1 scans are only recorded (for lazy precursor derivation) and MS3
    scans are skipped.
    """
    self.inconsistent_precursors = 0
    self.scans_written = 0

    self.lastMS1ScanName = None
    self.possible_precursors = None
    for time, mz, scanNum, scanLevel, scanMode in self.scanInfo:
        # .WIFF scans are keyed by time rather than an integer scan number.
        scanName = scanNum if isinstance(scanNum, int) else time

        if scanLevel == 'MS1':
            # Remember the MS1; precursor candidates are computed lazily
            # only if an MS2 from this cycle needs them.
            self.lastMS1ScanName = scanName
            self.possible_precursors = None
            continue
        elif scanLevel == 'MS3':
            continue
        elif self.lastMS1ScanName == None:
            # MS2 before any MS1; no precursor information available.
            continue

        # Each file type handles centroiding differently (or not at all.)
        if self.data.format == 'raw':
            scan = self.data.scan(scanName, centroid = self.centroid)
            scan = self.raw_scan_recalibration(scan)
        elif self.data.format == 'wiff':
            # explicit_numbering, of course, can't be active here.
            scan = self.data.scan(scanName)
            if self.centroid:
                scan = centroid_func(scan)
        elif self.data.format == 'd':
            scan = self.data.scan(scanName, centroid = self.centroid)
            if self.centroid and not scan:
                # mzAPI.D returns empty if centroid data is not present in
                # the file, but that can be corrected by external centroiding.
                scan = centroid_func(self.data.scan(scanName, centroid = False))
        else:
            raise NotImplementedError, ("Extractor does not handle type %s"
                                        % self.data.format)

        if self.filters and not mz:
            # Recover the precursor m/z from the RAW scan filter string.
            mz = float(self.filters[time].split('@')[0].split(' ')[-1])

        mzP = None
        chargeP = None
        # Vendor-reported precursor, when available and enabled.
        if ("scanPrecursor" in dir(self.data)
            and self.derive_precursor_via in ['All', 'Thermo']):
            assert isinstance(scanName, int)
            mzP, chargeP = self.data.scanPrecursor(scanName)

        if (not mzP) or self.derive_precursor_via in ['Direct']:
            # 'and derive_precursor_via not in ['Thermo']', except why would you?
            # .scanPrecursor sometimes returns charge and not mzP.
            mzP, chargeP = self.get_precursor(mz, chargeP)
            if not mzP:
                # Release presumed charge possibly obtained from scanPrecursor.
                mzP, chargeP = self.get_precursor(mz, None)
            if mz and chargeP:
                self.inconsistent_precursors += 1

        # Accept the derived precursor only if close to (or replacing an
        # absent) filter-line m/z; otherwise fall back to default_charge.
        if mzP and (abs(mz - mzP) < 2 or not mz):
            mz = mzP
            charge = chargeP
        else:
            charge = self.default_charge
        if not charge:
            charge = self.default_charge

        if not mz:
            import warnings
            errmgf = os.path.abspath(self.filename)
            warnings.warn('Unable to recover all precursor masses from %s'
                          % errmgf)
        else:
            # Skip precursors whose neutral mass exceeds the configured cap.
            if (self.maximum_mass and
                remove_protons(mz, charge) > self.maximum_mass):
                continue

            if self.labels:
                scan_labels = self.read_labels(scan)
            else:
                scan_labels = {}

            title = standard_title_write(self.filename, rt = time, mz = mz,
                                         mode = scanMode, scan = scanNum,
                                         **scan_labels)

            if self.deisoreduce and self.centroid:
                scan = deisotope_reduce_scan(scan, **self.deisoreduce_MS2_args)
            scan = [x for x in scan if x[0] > self.min_mz]
            assert charge, title
            yield scan, title, mz, charge
            self.scans_written += 1