def __init__(self, path, *args, **kwargs):
    """
    Open a data file through mzFile and load every scan into memory.

    :param path: path of the data file handed to ``mzFile``
    :param args: arguments (unused)
    :param kwargs: keywords (unused)

    Attributes set on the instance:
      - ``msrun``: the open ``mzFile`` object
      - ``scanrange``: value returned by ``msrun.scan_range()``
      - ``scans``: integer array from ``scanrange[0]`` to ``scanrange[1]``
        inclusive
      - ``times``: array with one ``scan_time_from_scan_name`` value per scan
      - ``data``: array holding one point array per scan
    """
    # TODO: tell the user when multiplierz / MSFileReader is not installed
    # (an earlier attempt at this check was disabled and never worked).
    print("Reading Data: %s" % (path,))
    self.msrun = mzFile(path)
    self.scanrange = self.msrun.scan_range()
    self.scans = np.arange(self.scanrange[0], self.scanrange[1] + 1)
    self.times = []
    self.data = []
    for s in self.scans:
        impdat = np.array(self.msrun.scan(s))
        # An empty scan converts to a 1-D array that cannot be column
        # indexed; only filter when there really are rows and columns.
        if impdat.ndim == 2:
            # Keep only points whose first column is above 10
            # (presumably m/z -- confirm against mzFile.scan).
            impdat = impdat[impdat[:, 0] > 10]
        self.data.append(impdat)
        self.times.append(self.msrun.scan_time_from_scan_name(s))
    self.times = np.array(self.times)
    try:
        self.data = np.array(self.data)
    except ValueError:
        # Scans of differing length cannot be stacked; recent NumPy raises
        # instead of silently building an object array, so build it by hand.
        ragged = np.empty(len(self.data), dtype=object)
        for position, scan in enumerate(self.data):
            ragged[position] = scan
        self.data = ragged
def getAcqPoints(datafile, resultFile):
    """
    Pair every search result with the acquisition time of its parent MS1.

    The scan list of ``datafile`` is walked once to map each MS2 scan
    number to the MS1 scan that precedes it.  For each entry of
    ``resultFile`` the m/z and scan number are parsed from its
    'Spectrum Description' field and the scan is translated to the time of
    the owning MS1 scan.

    :param datafile: path of the data file opened with ``mzFile``
    :param resultFile: iterable of mappings with a 'Spectrum Description' key
    :return: list of ``(mz, ms1_time)`` tuples, in input order
    :raises AssertionError: if the first scan is not an MS1 scan
    :raises Exception: if a scan is neither 'MS1' nor 'MS2'
    :raises KeyError: if a result refers to a scan that is not a known MS2
    """
    data = mzFile(datafile)
    try:
        scans = data.scan_info(0, 999999)

        ms2toms1 = {}
        ms1 = scans[0][2]
        ms2s = []
        assert scans[0][3] == 'MS1'
        for scan in scans:
            if scan[3] == 'MS1':
                # A new MS1 closes the previous cycle: file its MS2s.
                for ms2 in ms2s:
                    ms2toms1[ms2] = ms1
                ms1 = scan[2]
                ms2s = []
            elif scan[3] == 'MS2':
                ms2s.append(scan[2])
            else:
                raise Exception("Unidentified scan type of %s" % scan[3])
        # MS2 scans after the last MS1 belong to that last MS1.
        for ms2 in ms2s:
            ms2toms1[ms2] = ms1

        acqPoints = []
        for result in resultFile:
            description = result['Spectrum Description']
            mz = spectrumDescriptionToMZ(description)
            scan = spectrumDescriptionToScanNumber(description)
            acqPoints.append((mz, data.timeForScan(ms2toms1[scan])))
        return acqPoints
    finally:
        data.close()
def openDataFile(self, event):
    """
    Open the file named in the data control and hand it to the display.

    The status line reads "Opening MS data file..." while the file is being
    opened and "Ready." once the display has received it.  The display is
    also given a callback that writes a value, as text, into the scan
    control.

    :param event: the triggering event (unused)
    """
    def show_scan(value):
        # Callback passed to the display alongside the data.
        self.scanCtrl.SetValue(str(value))

    self.set_status("Opening MS data file...", 0)
    path = self.dataCtrl.GetValue()
    self.data = mzFile(path)
    self.display.setData(self.data, show_scan)
    self.set_status("Ready.", 0)
def __init__(self, path, *args, **kwargs):
    """
    Open a data file through mzFile and collect one time value per scan.

    :param path: path of the data file handed to ``mzFile``
    :param args: arguments (unused)
    :param kwargs: keywords (unused)

    If the first ``mzFile(path)`` call fails, ``register()`` is run and the
    open is retried once.

    Attributes set on the instance:
      - ``msrun``: the open ``mzFile`` object
      - ``scanrange``: value returned by ``msrun.scan_range()``
      - ``scans``: integer array from ``scanrange[0]`` up to, but not
        including, ``scanrange[1]``
      - ``times``: array with one entry per scan; the scan index itself is
        used where no time could be obtained
      - ``data``: ``None`` (scan data is not loaded here)
    """
    # TODO: tell the user when multiplierz / MSFileReader is not installed
    # (an earlier attempt at this check was disabled and never worked).
    print("Reading Data:", path)
    try:
        self.msrun = mzFile(path)
    except Exception:
        # Was a bare ``except:``, which also trapped Ctrl-C and SystemExit.
        register()
        self.msrun = mzFile(path)
    self.scanrange = self.msrun.scan_range()
    self.scans = np.arange(self.scanrange[0], self.scanrange[1])
    self.times = []
    self.data = None

    # Ways of obtaining a scan time, tried in this order until one works.
    lookups = (
        lambda index: self.msrun.scan_time_from_scan_name(index),
        lambda index: self.msrun.info[index][0],
        lambda index: self.msrun.scan_info()[index][0],
    )
    for s in self.scans:
        s = s - 1
        errors = []
        for lookup in lookups:
            try:
                t = lookup(s)
                break
            except Exception as err:
                errors.append(err)
        else:
            # Every lookup failed: fall back to the scan index.
            print("Error getting scan times:", errors[0], errors[1], errors[2])
            print("Using Scan rather than Time")
            t = s
        self.times.append(t)
    self.times = np.array(self.times)
def async_mzFile_internal(datafile, size_cap, input, output):
    """
    Worker loop that serves method calls on an mzFile through two queues.

    Commands are tuples read from ``input``.  Only the first element is
    inspected for dispatch:

      - ``('close', ...)`` ends the loop;
      - ``('call', method, (args, kwargs))`` runs ``method`` on the data
        file and puts ``(command, result)`` on ``output``;
      - any other ``(tag, method, (args, kwargs))`` is run only to fill the
        cache; nothing is put on ``output``.

    'call' commands are served before other pending commands.  Results are
    cached under ``arghash(method, (args, kwargs))``.

    :param datafile: path of the data file opened with ``mzFile``
    :param size_cap: accepted but currently unused
    :param input: queue of incoming commands
    :param output: queue that receives ``(command, result)`` tuples

    NOTE(review): the cache grows without bound; ``size_cap`` looks like it
    was meant to limit it but no eviction is implemented.

    Any exception is printed with its traceback and then re-raised.
    """
    try:
        try:
            from queue import Empty  # Python 3
        except ImportError:
            from Queue import Empty  # Python 2

        data = mzFile(datafile)
        try:
            cache = {}
            commands = []
            while True:
                if not commands:
                    # Nothing pending: wait for the next command.
                    commands = [input.get(block=True)]
                # Pick up whatever else has arrived in the meantime.
                while True:
                    try:
                        commands.append(input.get_nowait())
                    except Empty:
                        break

                if any(x[0] == 'close' for x in commands):
                    break

                call = next((x for x in commands if x[0] == 'call'), None)
                if call:
                    commands.remove(call)
                    command = call
                else:
                    assert commands
                    command = commands[0]
                    commands = commands[1:]

                _, method, argkwarg = command
                callhash = arghash(method, argkwarg)
                if callhash not in cache:
                    args, kwargs = argkwarg
                    cache[callhash] = getattr(data, method)(*args, **kwargs)
                if call:
                    output.put((call, cache[callhash]))
        finally:
            data.close()
    except Exception:
        import traceback
        traceback.print_exc()
        print('------------------')
        # Bare raise keeps the original traceback.
        raise
def dataReaderProc(datafile, que, scanNumbers):
    """
    Read centroided scans from a data file and stream them onto a queue.

    For each entry of ``scanNumbers`` a ``(scanNum, scan)`` tuple is put on
    ``que`` (blocking while the queue is full).  The string ``'done'`` is
    put last, after which the data file is closed.

    :param datafile: path of the data file opened with ``mzFile``
    :param que: queue that receives the scans and the final ``'done'``
    :param scanNumbers: iterable of scan numbers to read, in order

    On failure the traceback is printed and the exception re-raised;
    ``'done'`` is NOT sent in that case.
    """
    try:
        data = mzFile(datafile)
        try:
            for scanNum in scanNumbers:
                scan = data.scan(scanNum, centroid=True)
                que.put((scanNum, scan), block=True)
            que.put('done')
        finally:
            # Release the file even when a scan read fails.
            data.close()
    except Exception:
        import traceback
        print("READ THREAD ERROR.")
        traceback.print_exc()
        print('------------------')
        # Bare raise keeps the original traceback.
        raise
def detect_features(datafile, **constants):
    """
    Runs the feature detection algorithm on the target data file (currently,
    only Thermo .RAW is supported.)  Returns the path to the feature data
    file.

    Optional arguments:
    - outputfile: path of the feature file to write (default: datafile +
      '.features').
    - tolerance (default 10): MZ tolerance in parts-per-million for all
      determinations of peak identity.  Should usually correspond to the
      mass precision of the source instrument.  Stored in the module-level
      ``tolerance`` global.
    - force (default False): If True, feature detection is run even if a
      feature data file already exists for the target data.
    - partial: (low, high) pair; only MS1 scans strictly between the two
      are used.  Primarily for testing.  The result is then written to
      datafile + 'low-high.features'.
    - whitelist_psms: collection of m/z values; only isotopic sequences
      whose first m/z lies within ``whitelist_tol`` of one of them (after
      rounding to two decimals) are kept.  Changes the output file name.
    - peak_picking_params: keyword arguments for ``peak_pick_PPM``;
      defaults to the tolerance alone.

    The full ``constants`` dict is also passed to ``setGlobals``.

    Raises RuntimeError if the scan-reading process exits without
    signalling completion.
    """
    global tolerance

    try:
        from queue import Empty  # Python 3
    except ImportError:
        from Queue import Empty  # Python 2

    if 'outputfile' in constants:
        featurefile = constants['outputfile']
    else:
        featurefile = datafile + '.features'

    if constants.get('tolerance'):
        tolerance = constants['tolerance']
        if tolerance < 1:
            print("\n\n\nWARNING- tolerance value for SILAC analysis should now be in PPM!\n\n\n")
    else:
        tolerance = 10

    # 'partial' is primarily for testing purposes only.
    scanrange = constants.get('partial')
    force = constants.get('force', False)

    if 'whitelist_psms' in constants:
        whitelist_mzs = constants['whitelist_psms']
        featurefile = datafile + '.partial%s.features' % (
            str(hash(frozenset(whitelist_mzs)))[:5])
    else:
        whitelist_mzs = None

    if 'peak_picking_params' in constants:
        peak_pick_params = constants['peak_picking_params']
    elif constants.get('tolerance'):
        peak_pick_params = {'tolerance': constants['tolerance']}
    else:
        peak_pick_params = {'tolerance': 10}

    if os.path.exists(featurefile) and not force:
        vprint("Feature data file already exists: %s" % featurefile)
        return featurefile

    setGlobals(constants)

    data = mzFile(datafile)
    vprint("Opened data file; getting isotopes...")

    scaninfo = [x for x in data.scan_info(0, 99999999) if x[3] == 'MS1']
    rtLookup = dict((x[2], x[0]) for x in scaninfo)
    scaninfo = [x[2] for x in scaninfo]

    if scanrange:
        scaninfo = [x for x in scaninfo if scanrange[0] < x < scanrange[1]]

    data.close()

    # Scans are read in a separate process while peaks are picked here.
    que = multiprocessing.Queue(maxsize=20)
    reader = multiprocessing.Process(target=dataReaderProc,
                                     args=(datafile, que, scaninfo))
    reader.start()

    def next_item():
        # Like que.get(), but gives up once the reader process has died
        # instead of blocking forever on a queue nobody will fill.
        while True:
            try:
                return que.get(block=True, timeout=5)
            except Empty:
                if reader.is_alive():
                    continue
                try:
                    # Last chance for anything still in flight.
                    return que.get(block=True, timeout=1)
                except Empty:
                    raise RuntimeError(
                        "Scan reader process for %s exited before "
                        "finishing." % datafile)

    isotopeData = deque()
    thing = next_item()
    while thing != 'done':
        scanNum, scan = thing
        isotopeData.append((scanNum,
                            peak_pick_PPM(scan, **peak_pick_params)[0]))
        thing = next_item()
        if verbose_mode and len(isotopeData) % 100 == 0:
            print(len(isotopeData))  # Shielded by explicit verbose_mode check.

    reader.join()
    # Could just discard the un-feature'd peaks immediately.
    vprint("Isotopic features acquired; finding features over time...")

    # From here on scans are addressed by their position in scaninfo.
    ms1ToIndex = {}
    indexToMS1 = {}
    for index, scanNum in enumerate(scaninfo):
        ms1ToIndex[scanNum] = index
        indexToMS1[index] = scanNum

    isotopesByChargePoint = defaultdict(lambda: defaultdict(
        lambda: ProximityIndexedSequence([], lambda x: x[0][0])))
    allIsotopes = []
    for scanNum, isotopesByCharge in isotopeData:
        scanIndex = ms1ToIndex[scanNum]
        for charge, isotopes in isotopesByCharge.items():
            for isoSeq in isotopes:
                isotopesByChargePoint[charge][scanIndex].add(isoSeq)
                allIsotopes.append((isoSeq, scanIndex, charge))

    del isotopeData

    for scanlookup in isotopesByChargePoint.values():
        for proxseq in scanlookup.values():
            proxseq.rebalance()

    if whitelist_mzs:
        vprint("Screening out irrelevant MZs; starting with %s..."
               % len(allIsotopes))
        # Both lists are sorted ascending and consumed from the high end.
        allIsotopes.sort(key=lambda x: x[0][0][0])
        whitelist_mzs = sorted(set(round(x, 2) for x in whitelist_mzs))
        isoAcc = []
        whitemz = whitelist_mzs.pop()
        while allIsotopes:
            iso = allIsotopes.pop()
            mz = iso[0][0][0]
            while whitelist_mzs and whitemz - mz > whitelist_tol:
                whitemz = whitelist_mzs.pop()
            if abs(whitemz - mz) < whitelist_tol:
                isoAcc.append(iso)

        allIsotopes = isoAcc
        vprint("...%s remain." % len(allIsotopes))

    # Ascending by the second value of the first peak, so pop() below
    # always yields the highest remaining seed.
    allIsotopes.sort(key=lambda x: x[0][0][1])

    seenIsotopes = set()
    # Can assume isotopic sequences are unique because floats.
    # (But it may not be a valid assumption, because detectors
    # and floating point approximations!)

    def trail(highIso, highChg, highScan, centerIndex, centerMZ, step):
        # Follow a seed envelope scan by scan in direction `step` (-1 is
        # backwards, +1 forwards) and return the [scanIndex, envelope]
        # stages found.  Stops at the end of the run, or once nothing has
        # matched for longer than dropoutTimeTolerance.
        stages = []
        curScan = highScan
        lastSeen = rtLookup[indexToMS1[curScan]]
        while True:
            curScan += step
            try:
                curRT = rtLookup[indexToMS1[curScan]]
            except KeyError:
                # Walked off one end of the scan list.
                if step < 0:
                    assert curScan < max(indexToMS1.keys())
                else:
                    assert curScan > max(indexToMS1.keys())
                break

            scanSeqs = isotopesByChargePoint[highChg][curScan].returnRange(
                centerMZ - 2, centerMZ + 1.5)
            scanSeqs.sort(key=lambda x: x[centerIndex][1], reverse=True)

            found = False
            for iso in scanSeqs:  # These are known to have centerMZ in common.
                # The indexes between iso and highIso may not be equivalent
                # if there's sub-C12 peak(s) in either.  For a first draft
                # this can be considered a feature, since C12s should be
                # consistent throughout features, but in some cases like
                # single-scan-dropouts of the C12 this is insufficient
                # and such discrepancies should be accounted for.
                if (inPPM(tolerance, iso[0][0], highIso[0][0])
                        and inPPM(tolerance, iso[1][0], highIso[1][0])
                        and tuple(iso) not in seenIsotopes):
                    stages.append([curScan, iso])
                    found = True
                    break

            if found:
                lastSeen = curRT
            elif abs(curRT - lastSeen) > dropoutTimeTolerance:
                break
        return stages

    featureList = []
    while allIsotopes:
        highIso, highScan, highChg = allIsotopes.pop()
        if tuple(highIso) in seenIsotopes:
            continue

        centerIndex, (centerMZ, _) = max(enumerate(highIso),
                                         key=lambda x: x[1][1])

        newFeature = [[highScan, highIso]]
        newFeature.extend(
            trail(highIso, highChg, highScan, centerIndex, centerMZ, -1))
        newFeature.extend(
            trail(highIso, highChg, highScan, centerIndex, centerMZ, +1))

        if len(newFeature) > 1:
            featureList.append((highChg, newFeature))

        for _, iso in newFeature:
            seenIsotopes.add(tuple(iso))

    # Translate scan positions back to real scan numbers.
    for chg, feature in featureList:
        for stage in feature:
            stage[0] = indexToMS1[stage[0]]

    class idLookup():
        # Identity mapping handed to calculate_bounds.
        def __getitem__(self, thing):
            return thing
    lookup = idLookup()

    if scanrange:
        # NOTE(review): this replaces any 'outputfile' / whitelist name
        # chosen above, and happens after the exists-check.
        featurefile = datafile + ('%s-%s.features' % tuple(scanrange))

    featureObjects = []
    for chg, feature in featureList:
        newfeature = Feature()
        for scan, envelope in feature:
            newfeature.add(envelope, scan, chg)
        newfeature.calculate_bounds(lookup)
        featureObjects.append(newfeature)
    save_feature_database(featureObjects, featurefile)

    vprint("Saved feature file.")
    return featurefile
def binByFullFeature(datafile, featureDB, results):
    """
    Annotate search results with the feature each one falls in.

    Every MS2 scan of ``datafile`` is mapped to the MS1 scan that precedes
    it (MS2 scans before the first MS1 are ignored).  Each result is then
    placed in one of three groups:

      - inside a feature: a feature within +/-0.01 m/z for which
        ``containsPoint`` is true; the one closest in m/z is used;
      - bordering a feature: otherwise, a feature within +/-1 m/z for which
        ``bordersPoint`` is true; the one closest in m/z is used;
      - unexplained: neither of the above.

    Results whose scan has no MS1 mapping are left out of the output.

    Each returned result gains the keys 'Feature', 'feature error',
    'feature start scan', 'feature end scan', 'feature start time',
    'feature end time', 'feature intensity', 'feature kurtosis' and
    'feature skewness'; values that do not apply are '-'.

    :param datafile: path of the data file opened with ``mzFile``
    :param featureDB: object providing ``mz_range(low, high)``, yielding
        ``(index, feature)`` pairs
    :param results: iterable of mutable result mappings (modified in place)
    :return: list of annotated results, grouped in the order above
    :raises Exception: if a scan is neither 'MS1' nor 'MS2'
    """
    data = mzFile(datafile)
    try:
        ms2toms1 = {}
        ms1 = None
        ms2s = []  # MS2s are dropped until the first MS1.
        for scan in data.scan_info(0, 999999):
            if scan[3] == 'MS1':
                for ms2 in ms2s:
                    ms2toms1[ms2] = ms1
                ms1 = scan[2]
                ms2s = []
            elif scan[3] == 'MS2':
                if ms1 is not None:
                    ms2s.append(scan[2])
            else:
                raise Exception("Unidentified scan type of %s" % scan[3])
        for ms2 in ms2s:
            ms2toms1[ms2] = ms1

        def nearest(candidates, mz):
            # Candidate (index, feature) pair closest to mz.
            return min(candidates, key=lambda x: abs(x[1].mz - mz))

        def scan_time(scan):
            # A falsy scan number is reported as '-'.
            return data.timeForScan(scan) if scan else '-'

        featureItems = defaultdict(list)
        edgeItems = defaultdict(list)
        inexplicableItems = []
        for result in results:
            mz = mzFromPSM(result)
            scan = scanFromPSM(result)
            charge = int(result['Charge'])

            try:
                scan = ms2toms1[scan]
            except KeyError:
                # No parent MS1 known for this scan; result is skipped.
                continue

            features = [(i, x)
                        for i, x in featureDB.mz_range(mz - 0.01, mz + 0.01)
                        if x.containsPoint(mz, scan, charge)]
            if features:
                index, feature = nearest(features, mz)
                span = min(feature.scans), max(feature.scans)
                featureItems[index].append(
                    (result, span, feature.c12Intensity(),
                     feature.kurtosis, feature.skewness))
                continue

            features = [(i, x)
                        for i, x in featureDB.mz_range(mz - 1, mz + 1)
                        if x.bordersPoint(mz, scan, charge)]
            if features:
                index, feature = nearest(features, mz)
                edge = feature.bordersPoint(mz, scan, charge)
                span = min(feature.scans), max(feature.scans)
                edgeItems[index].append(
                    (result, edge, span, feature.c12Intensity(),
                     feature.kurtosis, feature.skewness))
            else:
                inexplicableItems.append(result)

        groupedResults = []
        # Diagnostic only: counts features whose results disagree on the
        # peptide.  Not returned.
        overFitCount = 0
        for feature, members in featureItems.items():
            try:
                pep = members[0][0]['Peptide Sequence']
                if not all(x['Peptide Sequence'] == pep
                           for x, s, i, k, sk in members):
                    overFitCount += 1
            except KeyError:
                pep = members[0][0]['Annotated Sequence']
                if not all(x['Annotated Sequence'] == pep
                           for x, s, i, k, sk in members):
                    overFitCount += 1

            for result, span, intensity, kurtosis, skew in members:
                result['Feature'] = feature
                result['feature error'] = '-'
                result['feature start scan'] = span[0]
                result['feature end scan'] = span[1]
                result['feature start time'] = scan_time(span[0])
                result['feature end time'] = scan_time(span[1])
                result['feature intensity'] = intensity
                result['feature kurtosis'] = kurtosis
                result['feature skewness'] = skew
                groupedResults.append(result)

        for feature, resultEdges in edgeItems.items():
            for result, edge, span, intensity, kurtosis, skew in resultEdges:
                result['Feature'] = '-'
                result['feature error'] = str(feature) + " " + edge
                result['feature start scan'] = span[0]
                result['feature end scan'] = span[1]
                result['feature start time'] = scan_time(span[0])
                result['feature end time'] = scan_time(span[1])
                result['feature intensity'] = intensity
                result['feature kurtosis'] = kurtosis
                result['feature skewness'] = skew
                groupedResults.append(result)

        for result in inexplicableItems:
            result['Feature'] = '-'
            result['feature error'] = 'Feature not found'
            result['feature start scan'] = '-'
            result['feature end scan'] = '-'
            result['feature start time'] = '-'
            result['feature end time'] = '-'
            result['feature intensity'] = '-'
            result['feature kurtosis'] = '-'
            result['feature skewness'] = '-'
            groupedResults.append(result)

        return groupedResults
    finally:
        data.close()
def extract(datafile, outputfile=None, default_charge=2, centroid=True,
            scan_type=None, deisotope_and_reduce_charge=True,
            maximum_precursor_mass=15999, long_ms1=False,
            derive_precursor_via='All',
            deisotope_and_reduce_MS1_args={},
            deisotope_and_reduce_MS2_args={},
            min_mz=140, precursor_tolerance=0.005,
            isobaric_labels=None, label_tolerance=0.01,
            channel_corrections=None, prec_info_file=None,
            region_based_labels=False):
    """
    Converts a mzAPI-compatible data file to MGF.

    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.

    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.

    Missing entries of deisotope_and_reduce_MS1_args are filled with
    tolerance=0.01, min_peaks=2 and enforce_isotopic_ratios=True.  Neither
    argument dictionary passed by the caller is modified.

    outputfile defaults to datafile + '.mgf'.  An existing output file is
    only overwritten if its name ends in 'mgf' (AssertionError otherwise).

    Returns the output file path.
    """
    # Work on copies: the defaults are shared between calls and the
    # caller's dictionaries are theirs to keep unchanged.
    ms1_args = dict(deisotope_and_reduce_MS1_args)
    ms2_args = dict(deisotope_and_reduce_MS2_args)
    ms1_args.setdefault('tolerance', 0.01)
    ms1_args.setdefault('min_peaks', 2)
    ms1_args.setdefault('enforce_isotopic_ratios', True)

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    data = mzFile(datafile)
    from multiplierz.mgf.extraction import _extractor_
    extractor = _extractor_(data, datafile, default_charge, centroid,
                            scan_type, deisotope_and_reduce_charge,
                            derive_precursor_via, maximum_precursor_mass,
                            long_ms1, ms1_args, ms2_args, min_mz,
                            precursor_tolerance, isobaric_labels,
                            label_tolerance, channel_corrections,
                            prec_info_file, region_based_labels)

    writer = MGF_Writer(outputfile)
    try:
        for scan, title, mz, charge in extractor.run():
            writer.write(scan, title, mass=mz, charge=charge)
    finally:
        # Close the output even if extraction fails part-way.
        writer.close()

    if extractor.inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s"
               % (extractor.inconsistent_precursors,
                  extractor.scans_written))

    return outputfile
def writeIonAnnotations(self, datafile=None, in_place=False):
    """
    Add the peaks of each identified spectrum to the document and save it.

    For every SpectrumIdentificationList a FragmentationTable is found or
    created and given intensity and m/z Measure entries.  For every
    SpectrumIdentificationResult the spectrum title is parsed into a data
    file name and scan number, the centroided scan is read (only its 500
    most intense points if it has more), and every
    SpectrumIdentificationItem under it receives an IonType entry holding
    the m/z and intensity values as space-separated text.

    An AnalysisSoftware entry for Multiplierz is appended, the document is
    written out, and ``self.mzid`` is reopened on the written file.

    :param datafile: data file to read every scan from; when omitted, the
        file named in each spectrum title is used
    :param in_place: when true overwrite ``self.filename``; otherwise write
        to ``self.filename`` with its last five characters replaced by
        '_annotated.mzid'

    Opened data files are kept in ``self.filePointers``.

    NOTE(review): the FragmentationTable, Measure, cvParam, SoftwareName
    and userParam elements are created without ``self.pfx`` although the
    FragmentationTable lookup uses it -- confirm whether that is intended.
    """
    pfx = self.pfx

    # list(...) snapshots the matches because the loop bodies add elements.
    for spectrumList in list(
            self.root.iter(pfx + "SpectrumIdentificationList")):
        fragtab = next((x for x in spectrumList
                        if x.tag == pfx + "FragmentationTable"), None)
        if fragtab is None:
            fragtab = xml.SubElement(spectrumList, "FragmentationTable")

        intMeasure = xml.SubElement(fragtab, "Measure")
        intMeasure.set("id", "m_intensity")
        intMeasureKind = xml.SubElement(intMeasure, "cvParam")
        intMeasureKind.set("cvRef", "PSI-MS")
        intMeasureKind.set("accession", "MS:1001226")
        intMeasureKind.set("name", "product ion intensity")

        mzMeasure = xml.SubElement(fragtab, "Measure")
        mzMeasure.set("id", "m_mz")
        mzMeasureKind = xml.SubElement(mzMeasure, "cvParam")
        mzMeasureKind.set("cvRef", "PSI-MS")
        mzMeasureKind.set("accession", "MS:1001225")
        mzMeasureKind.set("name", "product ion m/z")

    def listStr(thing):
        # Space-separated values, with a trailing space.
        return "".join(str(x) + " " for x in thing)

    for spectrumResult in list(
            self.root.iter(pfx + "SpectrumIdentificationResult")):
        # Not used further; kept so an unknown reference still raises.
        dataEl = self.fileLookup[spectrumResult.get("spectraData_ref")]
        spectrumTitle = self.giveCVs(spectrumResult)['spectrum title']
        derivedData, scanNum = parseSpectrumTitle(spectrumTitle)

        # Decide per spectrum, so a document covering several data files
        # is not pinned to whichever file the first spectrum named.
        sourceFile = datafile or derivedData
        try:
            data = self.filePointers[sourceFile]
        except KeyError:
            data = mzFile(sourceFile)
            self.filePointers[sourceFile] = data

        spectrumItems = list(
            spectrumResult.iter(pfx + 'SpectrumIdentificationItem'))
        if not spectrumItems:
            continue

        # The scan is the same for every item of this result: read once.
        scan = data.scan(scanNum, centroid=True)
        if len(scan) > 500:
            scan = sorted(scan, key=lambda x: x[1], reverse=True)[:500]
        columns = unzip(scan)
        mzValues = listStr(columns[0])
        intValues = listStr(columns[1])

        for spectrumItem in spectrumItems:
            fragmentation = next(
                spectrumItem.iter(pfx + 'Fragmentation'), None)
            if fragmentation is None:
                fragmentation = xml.SubElement(spectrumItem,
                                               pfx + "Fragmentation")

            iontype = xml.SubElement(fragmentation, pfx + "IonType")
            iontype.set("index", "0 " * len(scan))
            iontype.set("charge", "0")
            ionKind = xml.SubElement(iontype, pfx + "cvParam")
            ionKind.set("cvRef", "PSI-MS")
            ionKind.set("accession", "MS:1001240")
            ionKind.set("name", "non-identified ion")

            mzArray = xml.SubElement(iontype, pfx + "FragmentArray")
            mzArray.set("values", mzValues)
            mzArray.set("measure_ref", "m_mz")
            intArray = xml.SubElement(iontype, pfx + "FragmentArray")
            intArray.set("values", intValues)
            intArray.set("measure_ref", "m_intensity")

    if not in_place:
        outputFile = self.filename[:-5] + "_annotated.mzid"
    else:
        outputFile = self.filename

    softwareUsed = next(self.root.iter(pfx + "AnalysisSoftwareList"))
    mzDesktopEl = xml.SubElement(softwareUsed, pfx + "AnalysisSoftware")
    mzDesktopEl.set("id", "DFCI Multiplierz v1.1.0")
    mzDesktopEl.set("name", "Multiplierz")
    mzDesktopEl.set("uri", "http://sourceforge.net/projects/multiplierz/")
    mzDesktopEl.set("version", __version__)
    softwareName = xml.SubElement(mzDesktopEl, "SoftwareName")
    nameParam = xml.SubElement(softwareName, "userParam")
    nameParam.set("name", "Multiplierz")

    self.mzid.close()
    # Pass the path so the tree opens the file in the mode it needs; a
    # text-mode handle is rejected on Python 3.
    self.tree.write(outputFile)
    self.mzid = open(outputFile, "r")
def extract(datafile, outputfile=None, default_charge=2, centroid=True,
            scan_type=None, deisotope_and_reduce_charge=True,
            deisotope_and_reduce_args={},
            min_mz=140, precursor_tolerance=0.005,
            isobaric_labels=None, label_tolerance=0.01):
    """
    Converts a mzAPI-compatible data file to MGF.

    Writes only MS2 spectra where these can be determined, otherwise takes
    every spectrum in the file.  Likewise writes the precursor charge
    and mass if these can be determined.

    deisotope_and_reduce_charge deisotopes and charge-reduces each MS2
    spectrum, which generally improves results from peptide database search
    algorithms. However, it should be disabled for very low-resolution scans.

    scan_type (one of CID, HCD, ETD, ETDSA) restricts the MS2 scans taken
    and is only accepted for .RAW files.  isobaric_labels may be 4, 6, 8 or
    10 (or '4plex' etc.) and requires centroid=True; 10-plex additionally
    requires label_tolerance below 0.005.  deisotope_and_reduce_args is not
    modified; if it has no 'tolerance', precursor_tolerance is used.

    outputfile defaults to datafile + '.mgf'.  An existing output file is
    only overwritten if its name ends in 'mgf' (AssertionError otherwise).

    Returns the output file path.
    """
    # Currently doesn't compensate for injection time! Would be required in
    # order to deal with iTRAQ/TMT labels.

    import warnings
    from multiplierz.spectral_process import deisotope_reduce_scan, peak_pick
    # Distinct from 'centroid' argument.
    from multiplierz.spectral_process import centroid as centroid_func

    def _get_precursor(mz, possible_prec, charge):
        # Closest (mz, charge) candidate, optionally restricted to a charge.
        try:
            return min([x for x in possible_prec
                        if (charge is None or x[1] == charge)],
                       key=lambda x: abs(x[0] - mz))
        except ValueError:
            return None, None

    def calculate_precursors(ms1ScanName, calibrant):
        # Peak-pick the given MS1 scan; returns ([(mz, charge), ...],
        # calibrant).
        if data.format == 'raw':
            ms1 = data.lscan(ms1ScanName)
            ms1, calibrant = raw_scan_recalibration(ms1, calibrant)
        else:
            try:
                ms1 = data.scan(ms1ScanName, centroid=True)
            except NotImplementedError:
                ms1 = centroid_func(data.scan(ms1ScanName))

        envelopes = peak_pick(ms1, tolerance=0.01, min_peaks=2,
                              enforce_isotopic_ratios=True)[0]
        return sum([[(x[0][0], c) for x in xs]
                    for c, xs in envelopes.items()], []), calibrant

    def read_labels(scan):
        # Reporter intensity per label, as text; '0' where nothing is
        # within label_tolerance.
        partscan = [x for x in scan if x[0] < labels[-1][1] + 3]
        if not partscan:
            return dict((str(label), '0') for label, _ in labels)

        # This should probably actually sum all points within
        # the tolerance range.
        scan_values = {}
        for label, label_mz in labels:
            nearpt = min(partscan, key=lambda x: abs(x[0] - label_mz))
            if abs(nearpt[0] - label_mz) < label_tolerance:
                scan_values[str(label)] = '%.3f' % nearpt[1]
            else:
                scan_values[str(label)] = '0'  # Report noise value?
        return scan_values

    # Private copy: the default dict is shared between calls, and a
    # tolerance stored in it would leak into later calls.
    reduce_args = dict(deisotope_and_reduce_args)
    if not reduce_args.get('tolerance'):
        reduce_args['tolerance'] = precursor_tolerance

    if not outputfile:
        outputfile = datafile + '.mgf'

    if os.path.exists(outputfile):
        assert outputfile.lower().endswith('mgf'), (
            "Overwriting a non-MGF file %s with "
            "the MGF extractor is probably a mistake." % outputfile)

    if isobaric_labels:
        assert centroid, ("Isobaric tags can only be read from centroided "
                          "data; set 'centroid' to True.")

    # list(zip(...)) so labels can be indexed and tested for emptiness on
    # Python 3 as well.
    if not isobaric_labels:
        labels = []
    elif isobaric_labels in (4, '4plex'):
        labels = list(zip([114, 115, 116, 117],
                          [114.11, 115.11, 116.11, 117.12]))
    elif isobaric_labels in (6, '6plex'):
        labels = list(zip([126, 127, 128, 129, 130, 131],
                          [126.127, 127.131, 128.134,
                           129.138, 130.141, 131.138]))
    elif isobaric_labels in (8, '8plex'):
        labels = list(zip([113, 114, 115, 116, 117, 118, 119, 121],
                          [113.11, 114.11, 115.11, 116.11,
                           117.12, 118.12, 119.12, 121.12]))
    elif isobaric_labels in (10, '10plex'):
        labels = list(zip(['126', '127N', '127C', '128N', '128C',
                           '129N', '129C', '130N', '130C', '131'],
                          [126.127726, 127.124761, 127.131081, 128.128116,
                           128.134436, 129.131471, 129.137790, 130.134825,
                           130.141145, 131.138180]))
        assert label_tolerance < 0.005, (
            "label_tolerance must be lower "
            "than 0.005 for 10-plex experiments! (Currently %s)"
            % label_tolerance)
    else:
        raise NotImplementedError(
            "Labels of type %s not recognized.\n"
            "Should be one of [4,6,8,10] or None." % (isobaric_labels,))

    inconsistent_precursors = 0
    scans_written = 0

    writer = MGF_Writer(outputfile)
    data = None
    try:
        data = mzFile(datafile)
        scanInfo = data.scan_info()

        # Coerce that scanInfo be in order of time, so that for .WIFF files
        # we can still use the previous-MS1 method to look up precursor
        # charges.
        scanInfo.sort(key=lambda x: x[0])

        if datafile.lower().endswith('.raw'):  # May also exist for WIFF?
            filters = dict(data.filters())

            # For RAW files only, there's the option to filter by a given
            # scan type.  (It would be more efficient in many cases to
            # actually split files in a single run, though.)
            if scan_type:
                assert (scan_type.lower() in
                        ['cid', 'hcd', 'etd', 'etdsa']), (
                    "Invalid scan type %s, must be one "
                    "of (CID, HCD, ETD, ETDSA)." % scan_type)
                typestr = "@%s" % scan_type.lower()
                scanInfo = [x for x in scanInfo
                            if x[3] == 'MS1' or typestr in filters[x[0]]]
        else:
            filters = None
            assert not scan_type, ("Scan type filtering only enabled with "
                                   ".RAW format files.")

        # Same answer for every scan, so ask once.
        has_scan_precursor = "scanPrecursor" in dir(data)

        lastMS1ScanName = None
        possible_precursors = None
        calibrant = RAW_CAL_MASS
        for rt, mz, scanNum, scanLevel, scanMode in scanInfo:
            scanName = scanNum if isinstance(scanNum, int) else rt

            if scanLevel == 'MS1':
                # Precursors are only computed if an MS2 needs them.
                lastMS1ScanName = scanName
                possible_precursors = None
                continue
            elif scanLevel == 'MS3':
                continue
            elif lastMS1ScanName is None:
                continue

            # Each file type handles centroiding differently (or not at
            # all.)
            if data.format == 'raw':
                scan = data.scan(scanName, centroid=centroid)
                scan, calibrant = raw_scan_recalibration(scan, calibrant)
            elif data.format == 'wiff':
                # explicit_numbering, of course, can't be active here.
                scan = data.scan(scanName)
                if centroid:
                    scan = centroid_func(scan)
            elif data.format == 'd':
                scan = data.scan(scanName, centroid=centroid)
                if centroid and not scan:
                    # mzAPI.D returns empty if centroid data is not present
                    # in the file, but that can be corrected by external
                    # centroiding.
                    scan = centroid_func(data.scan(scanName,
                                                   centroid=False))
            else:
                raise NotImplementedError(
                    "Extractor does not handle type %s" % data.format)

            if filters and not mz:
                mz = float(filters[rt].split('@')[0].split(' ')[-1])

            mzP = None
            chargeP = None
            if has_scan_precursor:
                assert isinstance(scanName, int)
                mzP, chargeP = data.scanPrecursor(scanName)

            if not mzP:
                # .scanPrecursor sometimes returns charge and not mzP.
                if possible_precursors is None:
                    possible_precursors, calibrant = calculate_precursors(
                        lastMS1ScanName, calibrant)

                mzP, chargeP = _get_precursor(mz, possible_precursors,
                                              chargeP)
                if not mzP:
                    # Release presumed charge possibly obtained from
                    # scanPrecursor.
                    mzP, chargeP = _get_precursor(mz, possible_precursors,
                                                  None)
                    if mz and chargeP:
                        inconsistent_precursors += 1

            # 'not mz' is tested first so a missing mz is never subtracted.
            if mzP and (not mz or abs(mz - mzP) < 2):
                mz = mzP
                charge = chargeP
            else:
                charge = default_charge

            if not charge:
                charge = default_charge

            if not mz:
                errmgf = os.path.abspath(datafile)
                warnings.warn('Unable to recover all precursor masses from %s'
                              % errmgf)
            else:
                scan_labels = read_labels(scan) if labels else {}

                title = standard_title_write(datafile, rt=rt, mz=mz,
                                             mode=scanMode, scan=scanNum,
                                             **scan_labels)

                # Should expand extract() call to include arguments to this.
                if deisotope_and_reduce_charge and centroid:
                    scan = deisotope_reduce_scan(scan, **reduce_args)
                scan = [x for x in scan if x[0] > min_mz]
                assert charge, title
                writer.write(scan, title, mass=mz, charge=charge)
                scans_written += 1
    finally:
        # Previously 'writer.close' without parentheses: never called.
        writer.close()
        if data is not None:
            data.close()

    if inconsistent_precursors:
        vprint("Precursor inconsistencies: %s/%s"
               % (inconsistent_precursors, scans_written))

    return outputfile
def writeIonAnnotations(self, datafile=None, in_place=False):
    """
    Add the peaks of each identified spectrum to the document and save it.

    For every SpectrumIdentificationList a FragmentationTable is found or
    created and given intensity and m/z Measure entries.  For every
    SpectrumIdentificationResult the spectrum title is parsed into a data
    file name and scan number, the centroided scan is read (only its 500
    most intense points if it has more), and every
    SpectrumIdentificationItem under it receives an IonType entry holding
    the m/z and intensity values as space-separated text.

    An AnalysisSoftware entry for Multiplierz is appended, the document is
    written out, and ``self.mzid`` is reopened on the written file.

    :param datafile: data file to read every scan from; when omitted, the
        file named in each spectrum title is used
    :param in_place: when true overwrite ``self.filename``; otherwise write
        to ``self.filename`` with its last five characters replaced by
        '_annotated.mzid'

    Opened data files are kept in ``self.filePointers``.

    NOTE(review): the FragmentationTable, Measure, cvParam, SoftwareName
    and userParam elements are created without ``self.pfx`` although the
    FragmentationTable lookup uses it -- confirm whether that is intended.
    """
    pfx = self.pfx

    # list(...) snapshots the matches because the loop bodies add elements.
    for spectrumList in list(
            self.root.iter(pfx + "SpectrumIdentificationList")):
        fragtab = next((x for x in spectrumList
                        if x.tag == pfx + "FragmentationTable"), None)
        if fragtab is None:
            fragtab = xml.SubElement(spectrumList, "FragmentationTable")

        intMeasure = xml.SubElement(fragtab, "Measure")
        intMeasure.set("id", "m_intensity")
        intMeasureKind = xml.SubElement(intMeasure, "cvParam")
        intMeasureKind.set("cvRef", "PSI-MS")
        intMeasureKind.set("accession", "MS:1001226")
        intMeasureKind.set("name", "product ion intensity")

        mzMeasure = xml.SubElement(fragtab, "Measure")
        mzMeasure.set("id", "m_mz")
        mzMeasureKind = xml.SubElement(mzMeasure, "cvParam")
        mzMeasureKind.set("cvRef", "PSI-MS")
        mzMeasureKind.set("accession", "MS:1001225")
        mzMeasureKind.set("name", "product ion m/z")

    def listStr(thing):
        # Space-separated values, with a trailing space.
        return "".join(str(x) + " " for x in thing)

    for spectrumResult in list(
            self.root.iter(pfx + "SpectrumIdentificationResult")):
        # Not used further; kept so an unknown reference still raises.
        dataEl = self.fileLookup[spectrumResult.get("spectraData_ref")]
        spectrumTitle = self.giveCVs(spectrumResult)['spectrum title']
        derivedData, scanNum = parseSpectrumTitle(spectrumTitle)

        # Decide per spectrum, so a document covering several data files
        # is not pinned to whichever file the first spectrum named.
        sourceFile = datafile or derivedData
        try:
            data = self.filePointers[sourceFile]
        except KeyError:
            data = mzFile(sourceFile)
            self.filePointers[sourceFile] = data

        spectrumItems = list(
            spectrumResult.iter(pfx + 'SpectrumIdentificationItem'))
        if not spectrumItems:
            continue

        # The scan is the same for every item of this result: read once.
        scan = data.scan(scanNum, centroid=True)
        if len(scan) > 500:
            scan = sorted(scan, key=lambda x: x[1], reverse=True)[:500]
        columns = unzip(scan)
        mzValues = listStr(columns[0])
        intValues = listStr(columns[1])

        for spectrumItem in spectrumItems:
            fragmentation = next(
                spectrumItem.iter(pfx + 'Fragmentation'), None)
            if fragmentation is None:
                fragmentation = xml.SubElement(spectrumItem,
                                               pfx + "Fragmentation")

            iontype = xml.SubElement(fragmentation, pfx + "IonType")
            iontype.set("index", "0 " * len(scan))
            iontype.set("charge", "0")
            ionKind = xml.SubElement(iontype, pfx + "cvParam")
            ionKind.set("cvRef", "PSI-MS")
            ionKind.set("accession", "MS:1001240")
            ionKind.set("name", "non-identified ion")

            mzArray = xml.SubElement(iontype, pfx + "FragmentArray")
            mzArray.set("values", mzValues)
            mzArray.set("measure_ref", "m_mz")
            intArray = xml.SubElement(iontype, pfx + "FragmentArray")
            intArray.set("values", intValues)
            intArray.set("measure_ref", "m_intensity")

    if not in_place:
        outputFile = self.filename[:-5] + "_annotated.mzid"
    else:
        outputFile = self.filename

    softwareUsed = next(self.root.iter(pfx + "AnalysisSoftwareList"))
    mzDesktopEl = xml.SubElement(softwareUsed, pfx + "AnalysisSoftware")
    mzDesktopEl.set("id", "DFCI Multiplierz v1.1.0")
    mzDesktopEl.set("name", "Multiplierz")
    mzDesktopEl.set("uri", "http://sourceforge.net/projects/multiplierz/")
    mzDesktopEl.set("version", __version__)
    softwareName = xml.SubElement(mzDesktopEl, "SoftwareName")
    nameParam = xml.SubElement(softwareName, "userParam")
    nameParam.set("name", "Multiplierz")

    self.mzid.close()
    # Pass the path so the tree opens the file in the mode it needs; a
    # text-mode handle is rejected on Python 3.
    self.tree.write(outputFile)
    self.mzid = open(outputFile, "r")
def evaluateMascotFile(resultfile, datafile = None, featurefile = None, outputfile = None):
    """
    Evaluate labelling for a Mascot result file using MS1 feature data.

    The 'Mascot_Header' sheet is read to find the quantitation method and
    variable modifications; one of them must indicate SILAC or a '-plex'
    label.  Each PSM is matched to a feature within +/-1 m/z that contains
    the PSM's m/z, parent MS1 scan and charge, and the feature's C12
    intensity is recorded for the PSM's scan.  The result is handed to
    ``evaluateSILAC`` or ``evaluateTMTiTRAQ``.

    :param resultfile: path of the Mascot result file
    :param datafile: path of the raw data file; opened with ``mzFile`` to
        map MS2 scans to MS1 scans, and used for feature detection when
        no ``featurefile`` is given
    :param featurefile: existing feature file; detected from ``datafile``
        when omitted
    :param outputfile: report path; defaults to the result file name with
        its extension replaced by '_LABEL_EVALUATION.xlsx'
    :return: ``(evaluation result, outputfile)``
    :raises AssertionError: if the label method is not recognized
    """
    header = [list(x.values())
              for x in reader(resultfile, sheet_name = 'Mascot_Header')]

    def retrieveHeaderValue(key):
        # First other value on the first header row containing key, or ''.
        for row in header:
            if key in row:
                others = [x for x in row if x != key]
                return others[0] if others else ''
        return ''

    quant = retrieveHeaderValue('Quantitation method')
    varmods = retrieveHeaderValue('Variable modifications')

    assert ('SILAC' in quant) or ('plex' in varmods), "Label method not recognized!"

    if not featurefile:
        featurefile = detectFeatures(datafile, signalToNoiseThreshold = 15)
    features = FeatureInterface(featurefile)

    print("Matching features to PSMs...")

    results = reader(resultfile)
    columns = results.columns
    results = list(results)

    # Map every MS2 scan to the MS1 scan that precedes it.
    data = mzFile(datafile)
    try:
        ms1map = {}
        ms2s = []
        ms1 = None
        for _, _, scan, level, _ in data.scan_info(0, 999999):
            if level == 'MS1':
                for ms2 in ms2s:
                    ms1map[ms2] = ms1
                ms1 = scan
                ms2s = []
            elif level == 'MS2':
                ms2s.append(scan)
        # MS2 scans after the last MS1 belong to that last MS1.  (This
        # used to store the list under the MS1 instead, leaving those
        # scans unmapped.)
        for ms2 in ms2s:
            ms1map[ms2] = ms1
    finally:
        data.close()

    featureIntMap = {}
    for psm in results:
        mz = psm['Experimental mz']
        scan = int(psm['Spectrum Description'].split('.')[1])
        charge = int(psm['Charge'])

        for index, feature in features.mz_range(mz - 1, mz + 1):
            if feature.containsPoint(mz, ms1map[scan], charge):
                featureIntMap[scan] = feature.c12Intensity()
                break

    del features

    if not outputfile:
        outputfile = '.'.join(resultfile.split('.')[:-1]) + '_LABEL_EVALUATION.xlsx'

    if 'SILAC' in quant:
        return evaluateSILAC(outputfile, columns, results, featureIntMap), outputfile
    elif 'plex' in varmods:
        return evaluateTMTiTRAQ(outputfile, columns, results, featureIntMap), outputfile
def psm_XIC_localized(directory, subdirs):
    """
    A peptide may appear in multiple fractions due various factors, but
    for the purpose of this analysis it is useful to consider a peptide
    as "belonging" only to the fraction in which the main bulk of the
    elution occurred.

    For each fraction in which a given peptide appeared, we take XICs over
    the m/z values for a set of possible charge and compare their total
    intensity; the fraction with the most intense XIC(s) is assigned that
    peptide for the final count.

    Every file in ``directory`` whose name ends in 'raw' is opened, keyed
    by the part of its name before the first '.'; result files are matched
    to raw files by the same part of their own name.  For each
    sub-directory, the .xlsx result files (other than earlier
    'XIC_localized' output) are read, and for each result file that won at
    least one peptide a '<name>.XIC_localized.xlsx' file is written next
    to it holding the first PSM of each such peptide.

    :param directory: directory holding the raw files and sub-directories
    :param subdirs: names of the sub-directories holding result files
    """
    tolerance = 0.1
    time_tolerance = 15

    rawfiles = {}
    try:
        for name in os.listdir(directory):
            if name.lower().endswith('raw'):
                rawfiles[name.split('.')[0]] = mzFile(
                    os.path.join(directory, name))

        columns = None
        for subdir in subdirs:
            resultfiles = typeInDir(os.path.join(directory, subdir), 'xlsx')
            resultfiles = [x for x in resultfiles
                           if 'XIC_localized' not in x]

            # peptide -> {result file -> PSMs of that peptide in the file}
            peptidesForFile = defaultdict(dict)
            for resultfile in resultfiles:
                rdr = reader(resultfile)
                columns = rdr.columns
                psmsByPeptide = collectByCriterion(
                    list(rdr),
                    lambda x: (x['Peptide Sequence'],
                               x['Variable Modifications']))
                for peptide, psms in psmsByPeptide.items():
                    peptidesForFile[peptide][resultfile] = psms

            outputByFile = defaultdict(list)
            for peptide, psmsByFile in peptidesForFile.items():
                allPSMs = [psm for psms in psmsByFile.values()
                           for psm in psms]

                mass = allPSMs[0]['Predicted mr']
                assert len(set(x['Predicted mr'] for x in allPSMs)) == 1
                charges = set(x['Charge'] for x in allPSMs)

                # (file name part, scan number text) per distinct spectrum.
                allScans = set(
                    tuple(x['Spectrum Description'].split('.')[:2])
                    for x in allPSMs)
                allRTs = set(
                    rawfiles[x[0]].scan_time_from_scan_name(int(x[1]))
                    for x in allScans)
                minRT, maxRT = min(allRTs), max(allRTs)

                xicsByFile = []
                for resultfile in psmsByFile:
                    # Take the base name first so a '.' in a directory
                    # name cannot truncate the path.
                    rawkey = os.path.basename(resultfile).split('.')[0]
                    rawfile = rawfiles[rawkey]

                    xicInt = 0
                    for charge in charges:
                        mz = (mass + (1.0072764 * charge)) / charge
                        xic = rawfile.xic(minRT - time_tolerance,
                                          maxRT + time_tolerance,
                                          mz - tolerance, mz + tolerance)
                        # An empty XIC contributes nothing.
                        xicInt += sum(point[1] for point in xic)
                    xicsByFile.append((xicInt, resultfile))

                highIntFile = max(xicsByFile, key=lambda x: x[0])[1]
                outputByFile[highIntFile].append(psmsByFile[highIntFile][0])

            for resultfile, psms in outputByFile.items():
                outputfile = resultfile[:-5] + '.XIC_localized.xlsx'
                output = writer(outputfile, columns=columns)
                try:
                    for psm in psms:
                        output.write(psm)
                finally:
                    output.close()
    finally:
        for rawfile in rawfiles.values():
            rawfile.close()