def write(self, scan, title, mass, charge=None):
    """
    Writes a single spectrum to the output file as one
    'BEGIN IONS' ... 'END IONS' entry.

    scan -- Sequence of datapoints.  Each datapoint is a sequence of
            two values (written as two tab-separated columns) or of
            three or more values (the first three are written).
    title -- Value written on the TITLE= line.
    mass -- Value written on the PEPMASS= line; omitted if falsy.
    charge -- Value written (as an integer) on the CHARGE= line;
              omitted if falsy.

    Spectra with 10000 or more datapoints are reduced to their 9999
    most intense points.  The caller's scan is never reordered or
    truncated; the reduction is performed on a copy.

    Raises ValueError if a datapoint has fewer than two values.
    """
    out = self.file
    out.write('BEGIN IONS\n')
    out.write('TITLE=%s\n' % title)
    if charge:  # Don't even bother with 0 charge, either.
        out.write('CHARGE=%s\n' % int(charge))
    if mass:  # 0 mass gives errors, and is usually due to null values.
        out.write('PEPMASS=%s\n' % mass)

    if len(scan) >= 10000:
        vprint("Scan at %s has %d datapoints, Mascot only allows "
               "10000; removing least-intense points." % (title, len(scan)))
        # sorted() builds a new list, so the caller's data is left intact.
        scan = sorted(scan, key=lambda x: x[1], reverse=True)[:9999]
        # Restore ascending order by the first column for output.
        scan.sort(key=lambda x: x[0])

    for pt in scan:
        if len(pt) == 2:
            out.write("%s\t%s\n" % (pt[0], pt[1]))
        elif len(pt) >= 3:
            out.write("%s\t%s\t%s\n" % (pt[0], pt[1], pt[2]))
        else:
            raise ValueError(
                "Scan datapoints must have both MZ and intensity!")

    out.write("END IONS\n")
def parse_to_generator(mgffile, labelType=(lambda x: x), header=False,
                       rawStrings=False):
    """
    Reads a Mascot Generic Format file and yields one dict per
    'BEGIN IONS' ... 'END IONS' entry, in file order.

    Each yielded dict maps the lower-cased field names of the entry
    (e.g. 'title', 'charge', 'pepmass') to their values, plus a
    'spectrum' key holding a list of tuples of floats, one tuple per
    datapoint line.

    labelType is called with the entry's TITLE value (or None if the
    entry has no TITLE); its result is currently not included in the
    yielded dict.

    header is accepted for interface compatibility but is currently
    unused; header lines are recognized and skipped.

    If rawStrings is True, CHARGE and PEPMASS are returned as the
    stripped strings found in the file.  Otherwise CHARGE is converted
    to int (after removing a trailing '+') and PEPMASS to the float
    value of its first whitespace-separated token.

    The file is closed when the generator is exhausted or closed.
    """
    with open(mgffile, "r") as f:
        topMatter = True
        for line in f:
            if "BEGIN IONS" in line:
                topMatter = False
                entry = {}
                key = None
                spectrum = []
                # Consume the entry's lines from the same file iterator.
                for line in f:
                    if 'END IONS' in line:
                        break
                    elif '=' in line:
                        # Split on the first '=' only; values (notably
                        # TITLE) frequently contain '=' themselves.
                        field, value = line.split('=', 1)
                        field = field.strip()
                        if (not rawStrings) and field == 'CHARGE':
                            value = int(value.strip('\n\r+ '))
                        elif (not rawStrings) and field == "PEPMASS":
                            value = float(value.split()[0].strip())
                        else:
                            value = value.strip()
                        entry[field.lower()] = value
                        if field == 'TITLE':
                            key = value.strip()
                    elif line.strip():
                        # Blank lines are skipped rather than stored as
                        # empty datapoints.
                        spectrum.append(tuple(map(float, line.split())))

                entry["spectrum"] = spectrum
                # Result is unused; the call is retained so that a
                # labelType that validates titles behaves as before.
                labelType(key)
                yield entry

            elif topMatter and '=' in line:
                field = line.split('=', 1)[0]
                known = (field in MGFTopMatter
                         or 'SEARCH=' in line or 'MASS=' in line)
                if not known:
                    vprint("Unexpected line: %s" % line)
def __init__(self, data_file):
    """
    Opens a T2D data file through its COM interface and caches basic
    acquisition information in self._info.

    The MS level and precursor mass are derived from the file name
    embedded in the description string returned by the COM object's
    GetInfo: the second underscore-separated token is expected to be
    'MS' or 'MSMS', and for 'MSMS' the third token is the precursor
    mass.  If the name does not follow that pattern a message is
    printed via vprint, the MS level is recorded as -1 and the
    precursor as None.

    self._info keys: 'Collision Energy' (float), 'MS Level' (int),
    'Precursor' (float or None), 'Range' (tuple of two floats).
    """
    self.datafile = data_file

    #self.source = CreateObject("{7e3450b1-75e7-49b2-9be7-64cbb2458c56}")
    self.source = Dispatch("{7e3450b1-75e7-49b2-9be7-64cbb2458c56}")
    self.source.OpenFile(data_file)

    cole, level, _, _, desc, explevel, exppar = self.source.GetInfo
    descwords = desc.split()
    filename = descwords[2].strip(',')

    name_parts = filename.split('_')
    # A name without any underscore is treated as unparsable instead
    # of raising IndexError.
    scan_kind = name_parts[1].upper() if len(name_parts) > 1 else ''

    # Default for every branch that cannot supply a precursor mass;
    # previously the unparsable branch left this name unbound.
    precursor = None
    if scan_kind == 'MSMS':
        level = 2
        precursor = float(name_parts[2])
    elif scan_kind == 'MS':
        level = 1
    else:
        vprint(
            "Unparsable T2D file name; MS level and precursor mass unavailable."
        )
        level = -1

    # The last three words of the description are read as
    # "(<start> <something> <end>)".
    mzrange = (float(descwords[-3].strip('(')),
               float(descwords[-1].strip(')')))

    self._info = {
        'Collision Energy': float(cole),
        'MS Level': int(level),
        'Precursor': precursor,
        'Range': mzrange
    }
def __init__(self, filename):
    """
    Opens a feature data file for reading.

    Files whose name ends with 'featurepickle' are loaded in legacy
    mode: the whole file is unpickled into self.data and self.mode is
    set to 'pickle'.  Any other file is opened as a SQLite database;
    self.connection holds the connection, self.data a cursor on it,
    and self.mode is 'sql'.

    self.decoder is set to newMarshal.

    Raises AssertionError if the file does not exist.
    """
    self.filename = filename
    assert os.path.exists(filename)

    if filename.lower().endswith('featurepickle'):
        vprint("Legacy mode enabled.")
        # NOTE: unpickling executes arbitrary code from the file; only
        # open feature files from trusted sources.
        # The file mode is left as in the original code.
        # NOTE(review): 'rb' may be more appropriate -- confirm how
        # legacy files were written.
        with open(filename) as legacy_file:
            self.data = pickle.load(legacy_file)
        self.mode = 'pickle'
    else:
        self.connection = sqlite3.connect(filename)
        self.data = self.connection.cursor()
        self.mode = 'sql'

    self.decoder = newMarshal
def filters(self):
    """
    Builds (and caches in self._filters) a list of
    (retention time, filter string) pairs, one per scan reported by
    self.scan_info(), with the filter strings formatted in the
    Thermo style expected by various legacy functions.
    """
    file_info = self.source.MSScanFileInformation

    ion_modes = file_info.IonModes
    if not ion_modes:
        vprint(
            "Could not determine separation/ionization; defaulting to GCMS."
        )
        separator = 'GC'
    else:
        # Either of bits 2 or 4 being set selects 'GC'.
        separator = 'GC' if ion_modes & (4 | 2) else 'TOF'

    energies = self.source.MSScanFileInformation.CollisionEnergy
    # A file-wide collision energy is only used when exactly one is
    # listed.
    file_energy = energies[0] if len(energies) == 1 else None

    if self._filters:
        return self._filters

    self._filters = []
    for rt, mz, index, level, polarity in self.scan_info():
        spectrum = self.source.GetSpectrum_6(index)
        # Yep, definitely spectrum-specific.
        mass_range = spectrum.MeasuredMassRange.QueryInterface(bc.IRange)

        energy = file_energy if file_energy else float(
            spectrum.CollisionEnergy)

        if level != 'MS1':
            level_digit = int(level[2])
            precursor_part = '%.4f@%.2f' % (mz, energy)
        else:
            level_digit = ''
            precursor_part = ''

        filter_string = "%s MS %s NSI Full ms%s %s[%.2f-%.2f]" % (
            separator, polarity, level_digit, precursor_part,
            (mass_range.Start), (mass_range.End))
        self._filters.append((rt, filter_string))

    return self._filters
def __init__(self, filename):
    """
    Opens a feature data file for reading.

    Files whose name ends with 'featurepickle' are loaded in legacy
    mode: the whole file is unpickled into self.data and self.mode is
    set to 'pickle'.

    Any other file is opened as a SQLite database; self.connection
    holds the connection, self.data a cursor on it, and self.mode is
    'sql'.  The stored data of the feature with ind=1 is inspected to
    choose self.decoder: oldMarshal if it contains a newline (old
    non-base64 encoding), newMarshal otherwise.  If the database has
    no feature with ind=1, newMarshal is used.

    Raises AssertionError if the file does not exist.
    """
    self.filename = filename
    assert os.path.exists(filename)

    if filename.lower().endswith('featurepickle'):
        vprint("Legacy mode enabled.")
        # NOTE: unpickling executes arbitrary code from the file; only
        # open feature files from trusted sources.
        # The file mode is left as in the original code.
        # NOTE(review): 'rb' may be more appropriate -- confirm how
        # legacy files were written.
        with open(filename) as legacy_file:
            self.data = pickle.load(legacy_file)
        self.mode = 'pickle'
    else:
        self.connection = sqlite3.connect(filename)
        self.data = self.connection.cursor()
        self.mode = 'sql'

        self.data.execute("SELECT data FROM features WHERE ind=1")
        row = self.data.fetchone()
        # fetchone() returns None when no row matched (e.g. a database
        # holding fewer than two features); nothing can be sniffed in
        # that case, so fall back to the current encoding.
        if row is not None and '\n' in str(row[0]):
            # Old non-base64 encoded feature file!
            self.decoder = oldMarshal
        else:
            self.decoder = newMarshal
def save_feature_database(features, outputfile, overwrite=None):
    """
    Saves a SQLite-mode feature database.

    Result file will have the extension '.features' .

    features -- Iterable of feature objects.  Each must provide an
                'mz' attribute and a two-element 'scanrange'
                attribute, and must be picklable.
    outputfile -- Path of the database file to create.
    overwrite -- Controls what happens if outputfile already exists.
                 With the default (None) or any true value the
                 existing file is replaced; with an explicit false
                 value (e.g. False) IOError is raised instead.

    Each feature is stored as a row (ind, mz, startscan, endscan,
    data) where ind is the feature's position in 'features' (starting
    at 0) and data is the base64 text of the feature pickled with
    protocol 2.
    """
    if os.path.exists(outputfile):
        # The original test was inverted (overwrite=True raised).  The
        # default keeps replacing the file, as existing callers expect.
        if overwrite is not None and not overwrite:
            raise IOError("Target file %s already exists!" % outputfile)
        os.remove(outputfile)

    conn = sqlite3.connect(outputfile)
    try:
        cur = conn.cursor()

        createTable = "CREATE TABLE features(ind int, mz real, startscan int, endscan int, data text)"
        cur.execute(createTable)
        vprint("Created table.")

        # Bound parameters keep full float precision for mz and avoid
        # relying on double-quoted SQL string literals.
        addFeature = 'INSERT INTO features VALUES (?, ?, ?, ?, ?)'
        for index, feature in enumerate(features):
            startscan, endscan = feature.scanrange
            featureData = base64.b64encode(pickle.dumps(feature, protocol=2))
            # Coerce to plain Python numbers so that numeric types the
            # sqlite3 module cannot bind directly are still accepted.
            cur.execute(addFeature,
                        (index, float(feature.mz), int(startscan),
                         int(endscan), featureData.decode()))
            if index % 100 == 0:
                conn.commit()

        vprint("Indexing...")
        createIndex = "CREATE INDEX mzindex ON features(mz, startscan)"
        cur.execute(createIndex)
        vprint("Analyzing...")
        cur.execute("ANALYZE")
        vprint("Final SQLite commit...")
        conn.commit()
    finally:
        # Release the database handle even if a write fails part-way.
        conn.close()
def detect_features(datafile, **constants):
    """
    Runs the feature detection algorithm on the target data file (currently,
    only Thermo .RAW is supported.)  Returns the path to the feature data file.

    Optional arguments:
    - tolerance (default 10): MZ tolerance in parts-per-million for all determinations
    of peak identity.  Should usually correspond to the mass precision of the
    source instrument.  Stored in the module-level global 'tolerance'.
    - force (default False): If True, feature detection is run even if a
    feature data file already exists for the target data.
    - outputfile: Path of the feature file to write; defaults to
    datafile + '.features'.
    - partial: A (low, high) pair; only MS1 scans whose number lies strictly
    between the two are processed, and the output file name is derived from
    the pair.  Primarily for testing.
    - whitelist_psms: Collection of MZ values; only isotopic sequences whose
    first MZ is near one of these are kept.  Also changes the output file name
    (overriding 'outputfile').
    - peak_picking_params: Dict of keyword arguments passed to peak_pick_PPM;
    defaults to {'tolerance': <tolerance>}.

    The full 'constants' dict is also passed to setGlobals().
    """

    # --- Resolve options. ---
    if 'outputfile' in constants:
        featurefile = constants['outputfile']
    else:
        featurefile = datafile + '.features'

    if 'tolerance' in constants and constants['tolerance']:
        global tolerance
        tolerance = constants['tolerance']
        if tolerance < 1:
            print "\n\n\nWARNING- tolerance value for SILAC analysis should now be in PPM!\n\n\n"
    else:
        tolerance = 10

    if 'partial' in constants:
        # This is primarily for testing purposes only.
        scanrange = constants['partial']
    else:
        scanrange = None

    if 'force' in constants:
        force = constants['force']
    else:
        force = False

    if 'whitelist_psms' in constants:
        whitelist_mzs = constants['whitelist_psms']
        # The file name carries the first five characters of the hash of the
        # whitelist, so different whitelists get different output files.
        featurefile = datafile + '.partial%s.features' % (str(
            hash(frozenset(whitelist_mzs)))[:5])
    else:
        whitelist_mzs = None

    if 'peak_picking_params' in constants:
        peak_pick_params = constants['peak_picking_params']
    elif 'tolerance' in constants and constants['tolerance']:
        peak_pick_params = {'tolerance': constants['tolerance']}
    else:
        peak_pick_params = {'tolerance': 10}

    # Reuse an existing result unless told otherwise.
    if os.path.exists(featurefile) and not force:
        vprint("Feature data file already exists: %s" % featurefile)
        return featurefile

    # NOTE(review): presumably sets module-level settings such as
    # whitelist_tol and dropoutTimeTolerance used below -- confirm.
    setGlobals(constants)

    # 'times' collects timestamps of each stage; it is not reported anywhere
    # in this function.
    times = []
    times.append(time.clock())
    data = mzFile(datafile)

    times.append(time.clock())
    vprint("Opened data file; getting isotopes...")

    # Collect MS1 scans only: rtLookup maps scan number -> retention time,
    # and scaninfo becomes the list of MS1 scan numbers.
    scaninfo = [x for x in data.scan_info(0, 99999999) if x[3] == 'MS1']
    rtLookup = dict([(x[2], x[0]) for x in scaninfo])
    scaninfo = [x[2] for x in scaninfo]

    if scanrange:
        scaninfo = [x for x in scaninfo if
                    scanrange[0] < x < scanrange[1]]

    data.close()

    # --- Peak-pick every MS1 scan. ---
    # Scans are read by a separate process (dataReaderProc) and handed over
    # through a bounded queue; the string 'done' marks the end of the stream.
    que = multiprocessing.Queue(maxsize=20)
    reader = multiprocessing.Process(target=dataReaderProc,
                                     args=(datafile, que, scaninfo))
    reader.start()
    isotopeData = deque()
    thing = que.get(block=True)
    bar = 0  # Accumulated peak-picking time; diagnostic only.
    while thing != 'done':
        scanNum, scan = thing
        foo = time.clock()
        isotopeData.append((scanNum, peak_pick_PPM(scan,
                                                   **peak_pick_params)[0]))
        bar += time.clock() - foo
        thing = que.get(block=True)

        if verbose_mode and len(isotopeData) % 100 == 0:
            print len(isotopeData) # Shielded by explicit verbose_mode check.

    reader.join()
    # Could just discard the un-feature'd peaks immediately.
    vprint("Isotopic features acquired; finding features over time...")
    times.append(time.clock())

    # Map between MS1 scan numbers and their consecutive position (index)
    # among the MS1 scans, so that "adjacent scan" means index +/- 1.
    ms1ToIndex = {}
    indexToMS1 = {}
    for index, scanNum in enumerate(scaninfo):
        ms1ToIndex[scanNum] = index
        indexToMS1[index] = scanNum

    # charge -> scan index -> ProximityIndexedSequence of isotopic sequences,
    # keyed on the MZ of each sequence's first peak.
    isotopesByChargePoint = defaultdict(lambda: defaultdict(
        lambda: ProximityIndexedSequence([], lambda x: x[0][0])))
    allIsotopes = []
    for scanNum, isotopesByCharge in isotopeData:
        scanIndex = ms1ToIndex[scanNum]
        for charge, isotopes in isotopesByCharge.items():
            for isoSeq in isotopes:
                isotopesByChargePoint[charge][scanIndex].add(isoSeq)
                allIsotopes.append((isoSeq, scanIndex, charge))

    del isotopeData

    for scanlookup in isotopesByChargePoint.values():
        for proxseq in scanlookup.values():
            proxseq.rebalance()

    # --- Optional whitelist screening. ---
    # Both lists are sorted ascending and consumed from the high end, so the
    # two are walked in step from the highest MZ downwards.
    if whitelist_mzs:
        vprint("Screening out irrelevant MZs; starting with %s..." %
               len(allIsotopes))
        allIsotopes.sort(key=lambda x: x[0][0][0])
        whitelist_mzs = sorted(list(set([round(x, 2) for x in whitelist_mzs])))
        isoAcc = []
        whitemz = whitelist_mzs.pop()
        while allIsotopes:
            iso = allIsotopes.pop()
            mz = iso[0][0][0]
            while whitelist_mzs and whitemz - mz > whitelist_tol:
                whitemz = whitelist_mzs.pop()
            if abs(whitemz - mz) < whitelist_tol:
                isoAcc.append(iso)

        allIsotopes = isoAcc
        vprint("...%s remain." 
               % len(allIsotopes))

    # Sort ascending by the intensity of the first peak; pop() below then
    # takes the highest such sequence first.
    allIsotopes.sort(key=lambda x: x[0][0][1])

    times.append(time.clock())

    seenIsotopes = set()
    # Can assume isotopic sequences are unique because floats.
    # (But it may not be a valid assumption, because detectors
    # and floating point approximations!)

    # --- Grow features outwards from each seed sequence. ---
    featureList = []
    while allIsotopes:
        highIso, highScan, highChg = allIsotopes.pop()
        if tuple(highIso) in seenIsotopes:
            continue

        # Position and MZ of the most intense peak of the seed sequence.
        centerIndex, (centerMZ, _) = max(enumerate(highIso),
                                         key=lambda x: x[1][1])

        newFeature = [[highScan, highIso]]
        curScan = highScan
        continuing = True
        lastSeen = rtLookup[indexToMS1[curScan]]
        while continuing: # Trailing the feature backwards.
            curScan -= 1
            try:
                curRT = rtLookup[indexToMS1[curScan]]
            except KeyError:
                # Ran off the start of the MS1 scan list.
                assert curScan < max(indexToMS1.keys())
                break

            scanSeqs = isotopesByChargePoint[highChg][curScan].returnRange(
                centerMZ - 2, centerMZ + 1.5)
            scanSeqs.sort(key=lambda x: x[centerIndex][1], reverse=True)

            found = False
            for iso in scanSeqs: # These are known to have centerMZ in common.
                # The indexes between iso and highIso may not be equivalent
                # if there's sub-C12 peak(s) in either.  For a first draft
                # this can be considered a feature, since C12s should be
                # consistent throughout features, but in some cases like
                # single-scan-dropouts of the C12 this is insufficient
                # and such discrepancies should be accounted for.
                if (inPPM(tolerance, iso[0][0], highIso[0][0])
                        and inPPM(tolerance, iso[1][0], highIso[1][0])
                        and tuple(iso) not in seenIsotopes):
                    newFeature.append([curScan, iso])
                    found = True
                    break # From "for iso in scanSeqs"

            # Stop once nothing has matched for longer than the allowed
            # dropout time.
            if found:
                lastSeen = curRT
            elif abs(curRT - lastSeen) > dropoutTimeTolerance:
                continuing = False

        curScan = highScan
        continuing = True
        lastSeen = rtLookup[indexToMS1[curScan]]
        while continuing: # Trailing the feature forwards; mostly repeat code.
            curScan += 1
            try:
                curRT = rtLookup[indexToMS1[curScan]]
            except KeyError:
                # Ran off the end of the MS1 scan list.
                assert curScan > max(indexToMS1.keys())
                break

            scanSeqs = isotopesByChargePoint[highChg][curScan].returnRange(
                centerMZ - 2, centerMZ + 1.5)
            scanSeqs.sort(key=lambda x: x[centerIndex][1], reverse=True)

            found = False
            for iso in scanSeqs: # These are known to have centerMZ in common.
                # Ditto.
                if (inPPM(tolerance, iso[0][0], highIso[0][0])
                        and inPPM(tolerance, iso[1][0], highIso[1][0])
                        and tuple(iso) not in seenIsotopes):
                    newFeature.append([curScan, iso])
                    found = True
                    break # From "for iso in scanSeqs"

            if found:
                lastSeen = curRT
            elif abs(curRT - lastSeen) > dropoutTimeTolerance:
                continuing = False

        # Only sequences seen in more than one scan count as a feature.
        if len(newFeature) > 1:
            featureList.append((highChg, newFeature))

        for _, iso in newFeature:
            seenIsotopes.add(tuple(iso))

    times.append(time.clock())

    # Convert scan indexes back to real MS1 scan numbers.
    for chg, feature in featureList:
        for stage in feature:
            stage[0] = indexToMS1[stage[0]]

    # Identity mapping passed to calculate_bounds as its lookup argument.
    class idLookup():
        def __getitem__(self, thing):
            return thing

    lookup = idLookup()

    # Note that this replaces any previously chosen output name, and happens
    # after the "already exists" check above.
    if scanrange:
        featurefile = datafile + ('%s-%s.features' % scanrange)

    # --- Build Feature objects and save. ---
    featureObjects = []
    for chg, feature in featureList:
        newfeature = Feature()
        for scan, envelope in feature:
            newfeature.add(envelope, scan, chg)
        newfeature.calculate_bounds(lookup)
        #newfeature.prepareBoxes(lookup)
        #newfeature.prepareBoxes() # It's entirely different, for some reason?
        #test = Feature()
        #for scan, envelope in feature:
            #test.add(envelope, scan, chg)
        #test.calculate_bounds(lookup)
        #assert test.mz == newfeature.mz and test.charge == newfeature.charge
        featureObjects.append(newfeature)

    save_feature_database(featureObjects, featurefile)

    vprint("Saved feature file.")
    times.append(time.clock())

    return featurefile
def extract(datafile, outputfile=None, default_charge=2, centroid=True,
            scan_type=None, deisotope_and_reduce_charge=True,
            maximum_precursor_mass=15999, long_ms1=False,
            derive_precursor_via='All',
            deisotope_and_reduce_MS1_args={},
            deisotope_and_reduce_MS2_args={}, min_mz=140,
            precursor_tolerance=0.005, isobaric_labels=None,
            label_tolerance=0.01, channel_corrections=None,
            prec_info_file=None, region_based_labels=False):
    """
    Converts a mzAPI-compatible data file to MGF.

    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.

    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.

    deisotope_and_reduce_MS1_args is completed with the defaults
    tolerance=0.01, min_peaks=2 and enforce_isotopic_ratios=True for any
    of those keys it lacks; the dict passed by the caller is not modified.

    If outputfile is not given, datafile + '.mgf' is used.  Returns the
    path of the written MGF file.

    Raises AssertionError if outputfile exists and does not end in 'mgf'.
    All other arguments are passed through unchanged to the extractor.
    """
    # Work on a copy: the default argument is a single dict shared by all
    # calls, and a caller-supplied dict should not be altered either.
    ms1_args = dict(deisotope_and_reduce_MS1_args or {})
    for key, val in [('tolerance', 0.01),
                     ('min_peaks', 2),
                     ('enforce_isotopic_ratios', True)]:
        ms1_args.setdefault(key, val)

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    data = mzFile(datafile)
    from multiplierz.mgf.extraction import _extractor_
    extractor = _extractor_(data, datafile, default_charge, centroid,
                            scan_type, deisotope_and_reduce_charge,
                            derive_precursor_via, maximum_precursor_mass,
                            long_ms1, ms1_args,
                            deisotope_and_reduce_MS2_args, min_mz,
                            precursor_tolerance, isobaric_labels,
                            label_tolerance, channel_corrections,
                            prec_info_file, region_based_labels)

    writer = MGF_Writer(outputfile)
    try:
        for scan, title, mz, charge in extractor.run():
            writer.write(scan, title, mass=mz, charge=charge)
    finally:
        # Close the output even if extraction fails part-way.
        writer.close()

    if extractor.inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s" %
               (extractor.inconsistent_precursors, extractor.scans_written))

    return outputfile
def extract(datafile, outputfile=None, default_charge=2, centroid=True,
            scan_type=None, deisotope_and_reduce_charge=True,
            deisotope_and_reduce_args={}, min_mz=140,
            precursor_tolerance=0.005, isobaric_labels=None,
            label_tolerance=0.01):
    """
    Converts a mzAPI-compatible data file to MGF.

    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.

    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.

    deisotope_and_reduce_args are keyword arguments for
    deisotope_reduce_scan; if it has no (true) 'tolerance' entry,
    precursor_tolerance is used.  The dict passed in is not modified.

    scan_type (RAW files only) restricts output to one of CID, HCD, ETD
    or ETDSA scans.  isobaric_labels may be None, 4, 6, 8, 10 or the
    matching '4plex' ... '10plex' string; reporter intensities found
    within label_tolerance of each label MZ are added to the title.
    Only points with MZ above min_mz are written.

    If outputfile is not given, datafile + '.mgf' is used.  Returns the
    path of the written MGF file.

    Raises AssertionError for invalid argument combinations and
    NotImplementedError for unrecognized label types or file formats.
    """
    # Currently doesn't compensate for injection time!  Would be required in
    # order to deal with iTRAQ/TMT labels.

    from multiplierz.spectral_process import deisotope_reduce_scan, peak_pick
    # Distinct from 'centroid' argument.
    from multiplierz.spectral_process import centroid as centroid_func

    def _get_precursor(mz, possible_prec, charge):
        # Closest (mz, charge) candidate, optionally restricted to a charge.
        try:
            return min([x for x in possible_prec
                        if (charge is None or x[1] == charge)],
                       key=lambda x: abs(x[0] - mz))
        except ValueError:
            # No candidates.
            return None, None

    # Work on a copy: the default argument is one dict shared by all calls,
    # so filling in 'tolerance' in place made the first call's
    # precursor_tolerance stick for every later call.
    reduce_args = dict(deisotope_and_reduce_args or {})
    if not reduce_args.get('tolerance'):
        reduce_args['tolerance'] = precursor_tolerance

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    writer = MGF_Writer(outputfile)
    try:
        data = mzFile(datafile)
        scanInfo = data.scan_info()

        # Coerce that scanInfo be in order of time, so that for .WIFF files
        # we can still use the previous-MS1 method to look up precursor charges.
        scanInfo.sort(key=lambda x: x[0])

        if datafile.lower().endswith('.raw'):  # May also exist for WIFF?
            filters = dict(data.filters())

            # For RAW files only, there's the option to filter by a given
            # scan type.  (It would be more efficient in many cases to
            # actually split files in a single run, though.)
            if scan_type:
                assert (scan_type.lower() in ['cid', 'hcd', 'etd', 'etdsa']), (
                    "Invalid scan type %s, must be one "
                    "of (CID, HCD, ETD, ETDSA).") % scan_type
                typestr = "@%s" % scan_type.lower()

                scanInfo = [x for x in scanInfo
                            if x[3] == 'MS1' or typestr in filters[x[0]]]
        else:
            filters = None
            assert not scan_type, "Scan type filtering only enabled with .RAW format files."

        if isobaric_labels:
            assert centroid, "Isobaric tags can only be read from centroided data; set 'centroid' to True."

        # labels: list of (label name, reporter MZ) pairs.
        if not isobaric_labels:
            labels = []
        elif isobaric_labels == 4 or isobaric_labels == '4plex':
            labels = list(zip([114, 115, 116, 117],
                              [114.11, 115.11, 116.11, 117.12]))
        elif isobaric_labels == 6 or isobaric_labels == '6plex':
            labels = list(zip([126, 127, 128, 129, 130, 131],
                              [126.127, 127.131, 128.134,
                               129.138, 130.141, 131.138]))
        elif isobaric_labels == 8 or isobaric_labels == '8plex':
            labels = list(zip([113, 114, 115, 116, 117, 118, 119, 121],
                              [113.11, 114.11, 115.11, 116.11,
                               117.12, 118.12, 119.12, 121.12]))
        elif isobaric_labels == 10 or isobaric_labels == '10plex':
            labels = list(zip(['126', '127N', '127C', '128N', '128C',
                               '129N', '129C', '130N', '130C', '131'],
                              [126.127726, 127.124761, 127.131081,
                               128.128116, 128.134436, 129.131471,
                               129.137790, 130.134825, 130.141145,
                               131.138180]))
            assert label_tolerance < 0.005, (
                "label_tolerance must be lower "
                "than 0.005 for 10-plex experiments! (Currently %s)"
                % label_tolerance)
        else:
            raise NotImplementedError(
                "Labels of type %s not recognized.\n"
                "Should be one of [4,6,8,10] or None." % (isobaric_labels,))

        def read_labels(scan):
            # Reporter intensities (as strings) keyed by label name.
            partscan = [x for x in scan if x[0] < labels[-1][1] + 3]
            if not partscan:
                return dict((str(label), '0') for label, _ in labels)

            # This should probably actually sum all points within
            # the tolerance range.
            scan_values = {}
            for label, mz in labels:
                nearpt = min(partscan, key=lambda x: abs(x[0] - mz))
                if abs(nearpt[0] - mz) < label_tolerance:
                    scan_values[str(label)] = '%.3f' % nearpt[1]
                else:
                    scan_values[str(label)] = '0'  # Report noise value?

            return scan_values

        def calculate_precursors(ms1ScanName, calibrant):
            # Peak-picks the given MS1 scan and returns
            # ([(mz, charge), ...], calibrant).
            if data.format == 'raw':
                ms1 = data.lscan(ms1ScanName)
                ms1, calibrant = raw_scan_recalibration(ms1, calibrant)
            else:
                try:
                    ms1 = data.scan(ms1ScanName, centroid=True)
                except NotImplementedError:
                    ms1 = centroid_func(data.scan(ms1ScanName))

            envelopes = peak_pick(ms1, tolerance=0.01, min_peaks=2,
                                  enforce_isotopic_ratios=True)[0]
            return sum([[(x[0][0], c) for x in xs]
                        for c, xs in envelopes.items()], []), calibrant

        inconsistent_precursors = 0
        scans_written = 0

        lastMS1ScanName = None
        possible_precursors = None
        calibrant = RAW_CAL_MASS
        for rt, mz, scanNum, scanLevel, scanMode in scanInfo:
            scanName = scanNum if isinstance(scanNum, int) else rt

            if scanLevel == 'MS1':
                lastMS1ScanName = scanName
                # Precursor candidates are computed lazily, only if a
                # following MS2 scan needs them.
                possible_precursors = None
                continue
            elif scanLevel == 'MS3':
                continue
            elif lastMS1ScanName is None:
                continue

            # Each file type handles centroiding differently (or not at all.)
            if data.format == 'raw':
                scan = data.scan(scanName, centroid=centroid)
                scan, calibrant = raw_scan_recalibration(scan, calibrant)
            elif data.format == 'wiff':
                # explicit_numbering, of course, can't be active here.
                scan = data.scan(scanName)
                if centroid:
                    scan = centroid_func(scan)
            elif data.format == 'd':
                scan = data.scan(scanName, centroid=centroid)
                if centroid and not scan:
                    # mzAPI.D returns empty if centroid data is not present in
                    # the file, but that can be corrected by external centroiding.
                    scan = centroid_func(data.scan(scanName, centroid=False))
            else:
                raise NotImplementedError(
                    "Extractor does not handle type %s" % data.format)

            if filters and not mz:
                mz = float(filters[rt].split('@')[0].split(' ')[-1])

            mzP = None
            chargeP = None
            if "scanPrecursor" in dir(data):
                assert isinstance(scanName, int)
                mzP, chargeP = data.scanPrecursor(scanName)

            if not mzP:  # .scanPrecursor sometimes returns charge and not mzP.
                if possible_precursors is None:
                    possible_precursors, calibrant = calculate_precursors(
                        lastMS1ScanName, calibrant)

                mzP, chargeP = _get_precursor(mz, possible_precursors, chargeP)
                if not mzP:
                    # Release presumed charge possibly obtained from scanPrecursor.
                    mzP, chargeP = _get_precursor(mz, possible_precursors, None)
                    if mz and chargeP:
                        inconsistent_precursors += 1

            # 'not mz' is tested first so that a missing mz is never used
            # in the subtraction.
            if mzP and (not mz or abs(mz - mzP) < 2):
                mz = mzP
                charge = chargeP
            else:
                charge = default_charge

            if not charge:
                charge = default_charge

            if not mz:
                import warnings
                errmgf = os.path.abspath(datafile)
                warnings.warn('Unable to recover all precursor masses from %s'
                              % errmgf)
            else:
                if labels:
                    scan_labels = read_labels(scan)
                else:
                    scan_labels = {}

                title = standard_title_write(datafile, rt=rt, mz=mz,
                                             mode=scanMode, scan=scanNum,
                                             **scan_labels)

                # Should expand extract() call to include arguments to this.
                if deisotope_and_reduce_charge and centroid:
                    scan = deisotope_reduce_scan(scan, **reduce_args)
                scan = [x for x in scan if x[0] > min_mz]
                assert charge, title
                writer.write(scan, title, mass=mz, charge=charge)
                scans_written += 1
    finally:
        # The original referenced writer.close without calling it, so the
        # output was never explicitly closed.
        writer.close()

    if inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s" % (inconsistent_precursors,
                                                     scans_written))

    return outputfile