def read_ms1_experiment(filepath):
    """Load a mass-spec file and return a new experiment holding only MS1 spectra.

    ``filepath`` may be given as ``str`` or ``bytes``; a ``str`` is encoded
    first because the path is handed to ``loadExperiment()`` as bytes.
    """
    if not isinstance(filepath, bytes):
        filepath = filepath.encode()

    loaded = MSExperiment()
    FileHandler().loadExperiment(filepath, loaded)

    survey_only = MSExperiment()
    for scan in loaded:
        if scan.getMSLevel() != 1:
            continue
        survey_only.addSpectrum(scan)
    return survey_only
def _read_mzml(self, mzml_filename):
    """Load an mzML file and convert its spectra with ``self._build_spectrum``.

    Spectra for which ``_build_spectrum`` returns ``None`` are dropped; every
    kept spectrum gets the file's base name stored under ``meta['filename']``.
    Returns the list of converted spectra in file order.
    """
    from pyopenms import MzMLFile, MSExperiment

    loaded = MSExperiment()
    MzMLFile().load(mzml_filename, loaded)

    converted_spectra = []
    for raw_spectrum in loaded:
        converted = self._build_spectrum(raw_spectrum)
        if converted is None:
            continue
        converted.meta.update({'filename': basename(mzml_filename)})
        converted_spectra.append(converted)
    return converted_spectra
def read_sample(input_file, expt_='ecoli', mode='pos'):
    """Build a ``Sample`` from the MS1 spectra of an mzML file.

    Only MS level 1 spectra are kept. Each one becomes a
    ``Scan(scan_number, retention_time, mz_values, intensity_values)`` whose
    scan number counts MS1 spectra from 0. m/z values and (integer-truncated)
    intensities are stored as tuples to keep the stored object compact.
    """
    sample = Sample(expt_, mode, input_file)

    experiment = MSExperiment()
    MzMLFile().load(input_file, experiment)
    print(input_file, experiment.getNrSpectra())

    ms1_scans = []
    for spectrum in experiment:
        if spectrum.getMSLevel() != 1:
            continue
        retention_time = spectrum.getRT()
        mz_values, intensities = spectrum.get_peaks()
        # The running scan number equals the count of MS1 scans seen so far.
        ms1_scans.append(
            Scan(len(ms1_scans),
                 retention_time,
                 tuple(mz_values),
                 tuple(int(value) for value in intensities)))

    sample.scans = ms1_scans
    sample.retention_index = tuple(scan.scan_number for scan in ms1_scans)
    sample.retention_time = tuple(scan.retention_time for scan in ms1_scans)
    return sample
def readms(file_path):
    """
    Read the MS1 spectra of an mzXML, mzML or mzData file.

    Arguments:
        file_path: string
            path to the dataset locally; the format is chosen from the
            (case-insensitive) file extension

    Returns:
        A 4-tuple ``(m/z, intensity, retention time, mean RT interval)``:
        - m/z: list with one NumPy array per MS1 spectrum
        - intensity: list with one NumPy array per MS1 spectrum
        - retention time: list of the MS1 retention times
        - mean RT interval: mean difference between consecutive retention
          times, or 0.0 when fewer than two MS1 spectra were read
        Zero-intensity peaks are dropped and the peaks of each spectrum are
        ordered by decreasing intensity.

    Raises:
        ValueError: if the extension is not .mzxml, .mzml or .mzdata.

    Examples:
        >>> from hpic.fileio import readms
        >>> ms,intensity,rt,rt_mean_interval = readms("MM14_20um.mzxml")
    """
    ms_format = os.path.splitext(file_path)[1].lower()
    # Validate the extension before allocating any OpenMS object.
    if ms_format == '.mzxml':
        loader = MzXMLFile()
    elif ms_format == '.mzml':
        loader = MzMLFile()
    elif ms_format == '.mzdata':
        loader = MzDataFile()
    else:
        raise ValueError('ERROR: %s is wrong format' % file_path)

    msdata = MSExperiment()
    loader.load(str(file_path), msdata)

    m_s = []
    intensity = []
    r_t = []
    for spectrum in msdata:
        if spectrum.getMSLevel() != 1:
            continue
        r_t.append(spectrum.getRT())
        p_ms = []
        p_intensity = []
        for peak in spectrum:
            peak_intensity = peak.getIntensity()
            if peak_intensity != 0:
                p_ms.append(peak.getMZ())
                p_intensity.append(peak_intensity)
        # Sorting the negated intensities ascending gives a descending order.
        order = np.argsort(np.negative(p_intensity))
        m_s.append(np.array(p_ms)[order])
        intensity.append(np.array(p_intensity)[order])

    rt_values = np.array(r_t)
    if rt_values.shape[0] > 1:
        rt_mean_interval = np.mean(np.diff(rt_values))
    else:
        rt_mean_interval = 0.0
    return m_s, intensity, r_t, rt_mean_interval
def getBasicQuality(exp: oms.MSExperiment, verbose: bool=False) -> mzqc.RunQuality:
    """
    getBasicQuality calculates the basic QualityMetrics from a mass spectrometry peak file and creates the related RunQuality object.

    Calculated basic QC metrics and proto-metrics necessary to calculate more elaborate QC metrics with additional data (e.g. ID).

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file to calculate metrics from
    verbose : bool, optional
        switches on verbose logging, by default False
        (not referenced anywhere in this function body)

    Returns
    -------
    mzqc.RunQuality
        A RunQuality object containing the list of metrics calculated and metadata collected, ready for integration into a mzQC file object.

    Notes
    -----
    * ``exp`` is modified: ``exp.sortSpectra()`` is called on it.
    * The loaded file path of ``exp`` is passed to ``utils.sha256fromfile``.
    * Every metric is emitted with the placeholder accession "QC:0000000".
    * Per-spectrum values are collected column-wise in dicts of lists
      (one list per key, one entry per spectrum of the respective MS level).
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Parent (source) file information; these three names are assigned here
    # but not used again in this function (see the TODO in the metadata below).
    if exp.getExperimentalSettings().getSourceFiles():
        parent_base_name: str = basename(exp.getExperimentalSettings().getSourceFiles()[0].getNameOfFile())
        parent_chksm: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksum()
        parent_chksm_type: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksumType()

    # Falls back to 'unknown' when the instrument has no such meta value.
    instr_srl: str = exp.getInstrument().getMetaValue('instrument serial number') \
        if exp.getInstrument().metaValueExists('instrument serial number') else 'unknown'  # MS:1000529 in mzML

    input_loc: str = exp.getExperimentalSettings().getLoadedFilePath()
    base_name: str = basename(input_loc)
    chksm: str = utils.sha256fromfile(exp.getExperimentalSettings().getLoadedFilePath())
    cmpltn: str = exp.getDateTime().get()
    # Disabled start-time computation, kept for reference:
    # strt:datetime.datetime = datetime.datetime.strptime(cmpltn, '%Y-%m-%d %H:%M:%S') - datetime.timedelta(seconds=exp.getChromatograms()[0][exp.getChromatograms()[0].size()-1].getRT()*60)

    # Run-level metadata: one input file (declared as mzML) and the software used.
    meta: mzqc.MetaDataParameters = mzqc.MetaDataParameters(
        inputFiles=[
            mzqc.InputFile(name=base_name,location=input_loc,
                        fileFormat=mzqc.CvParameter("MS", "MS:1000584", "mzML format"),
                        fileProperties=[
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000747",
                                name="completion time",
                                value=cmpltn
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000569",
                                name="SHA-256",
                                value=chksm
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000031",
                                name="instrument model",
                                value=exp.getInstrument().getName()
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000529",
                                name="instrument serial number",
                                value=instr_srl
                            )
                            # TODO integrate parent location and checksum
                            # id: MS:1002846 (Associated raw file URI) N.B. definition is PRIDE specific
                            # fitting checksum cv missing
                        ]
            )
        ],
        analysisSoftware=[
            mzqc.AnalysisSoftware(cvRef="MS", accession="MS:1000752", name="TOPP software", version=oms.__version__, uri="openms.de")
        ]
    )

    # this is mighty important to sort by RT
    # (rank and survey-scan bookkeeping below rely on acquisition order)
    exp.sortSpectra()

    # Range of MS2 precursor m/z values; stays at [sys.maxsize, 0] when no
    # MS2 spectrum is encountered.
    min_mz: float = sys.maxsize
    max_mz: float = 0
    mslevelcounts: Dict[int,int] = defaultdict(int)

    # Column-wise collectors, one per emitted metric table.
    spectrum_acquisition_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    spectrum_acquisition_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    spectrum_topn: Dict[str,List[Any]] = defaultdict(list)  # never filled below; emitted empty
    tandem_spectrum_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    isolation_window_metrics: Dict[str,List[Any]] = defaultdict(list)
    tic_tab: Dict[str,List[Any]] = defaultdict(list)

    # ActivationMethod look-up dict
    # (maps each int-valued attribute of ActivationMethod to its attribute name)
    ams = {getattr(ActivationMethod,i): i for i in dir(ActivationMethod) if type(getattr(ActivationMethod,i))==int }

    intens_sum: np.float = 0
    last_surveyscan_index:int = 0
    for spin, spec in enumerate(exp):
        mslevelcounts[spec.getMSLevel()] += 1

        iontraptime = utils.getTrapTime(spec)
        # NOTE(review): .max()/.min() on the intensity array presumably raise
        # for a spectrum without peaks - confirm whether such spectra can occur.
        intens_max = spec.get_peaks()[1].max()
        intens_min = spec.get_peaks()[1].min()  # not used further in this function
        intens_sum = spec.get_peaks()[1].sum()

        if spec.getMSLevel() == 1:
            # Remember the most recent survey scan; the MS2 branch refers to it.
            last_surveyscan_index = spin
            last_surveyscan_intensity = intens_sum
            last_surveyscan_max = intens_max

            spectrum_acquisition_metrics_MS1['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS1['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS1['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS1['int'].append(intens_sum.item())  # .item() for dtype to pytype

            trap_metrics_MS1['RT'].append(spec.getRT())
            trap_metrics_MS1['traptime'].append(iontraptime)

            # NOTE(review): unlike the 'int' column above this appends the
            # numpy scalar itself, without .item() - confirm this is intended.
            tic_tab['RT'].append(spec.getRT())
            tic_tab['int'].append(intens_sum)

        if (spec.getMSLevel() == 2):
            # NOTE(review): last_surveyscan_intensity / last_surveyscan_max are
            # only assigned in the MS1 branch; an MS2 spectrum before any MS1
            # spectrum would hit unassigned names further down - confirm input
            # always starts with a survey scan.
            if (spec.getPrecursors()[0].getMZ() < min_mz):
                min_mz = spec.getPrecursors()[0].getMZ()
            if (spec.getPrecursors()[0].getMZ() > max_mz):
                max_mz = spec.getPrecursors()[0].getMZ()

            spectrum_acquisition_metrics_MS2['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS2['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS2['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS2['int'].append(intens_sum.item())  # .item() for dtype to pytype
            spectrum_acquisition_metrics_MS2['native_id'].append(utils.spec_native_id(spec))

            # Position of this spectrum counted from the last survey scan.
            rank = spin - last_surveyscan_index
            spectrum_acquisition_metrics_MS2['rank'].append(rank)

            trap_metrics_MS2['RT'].append(spec.getRT())
            trap_metrics_MS2['traptime'].append(iontraptime)
            # First activation method of the first precursor, or 'unknown'.
            trap_metrics_MS2['activation_method'].append(ams.get(next(iter(spec.getPrecursors()[0].getActivationMethods()), None),'unknown'))
            # -1 marks a missing 'collision energy' meta value.
            trap_metrics_MS2['activation_energy'].append(spec.getPrecursors()[0].getMetaValue('collision energy') if \
                spec.getPrecursors()[0].metaValueExists('collision energy') else -1)

            # Locate the precursor m/z in the last survey scan. searchsorted
            # yields an insertion index, so the peak read below is the one at
            # that index (not necessarily the closest peak); an index equal to
            # the peak count means "past the end" and yields NaN.
            precursor_index = np.searchsorted(exp[last_surveyscan_index].get_peaks()[0], [exp[spin].getPrecursors()[0].getMZ()])[0]
            if precursor_index != np.array(exp[last_surveyscan_index].get_peaks()).shape[1]:
                precursor_err = spec.getPrecursors()[0].getMZ() - np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][0]
                precursor_int = np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][1]
            else:
                precursor_err = np.nan
                precursor_int = np.nan

            tandem_spectrum_metrics_MS2['RT'].append(spec.getRT())
            tandem_spectrum_metrics_MS2['precursor_intensity'].append(precursor_int)  # TODO different from mzid->mzml getPrecursors[0].getIntensity() ? YES, latter one usually zero
            tandem_spectrum_metrics_MS2['precursor_error'].append(precursor_err)
            tandem_spectrum_metrics_MS2['precursor_mz'].append(spec.getPrecursors()[0].getMZ())
            tandem_spectrum_metrics_MS2['precursor_c'].append(spec.getPrecursors()[0].getCharge())

            tandem_spectrum_metrics_MS2['surveyscan_intensity_sum'].append(last_surveyscan_intensity)
            tandem_spectrum_metrics_MS2['surveyscan_intensity_max'].append(last_surveyscan_max)

            isolation_window_metrics['RT'].append(spec.getRT())
            isolation_window_metrics['isolation_target'].append(spec.getPrecursors()[0].getMZ())  # https://github.com/OpenMS/OpenMS/blob/d17cc251fd0c4068eb253b03c9fb107897771fdc/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp#L1992
            isolation_window_metrics['isolation_lower'].append(spec.getPrecursors()[0].getIsolationWindowLowerOffset())
            isolation_window_metrics['isolation_upper'].append(spec.getPrecursors()[0].getIsolationWindowUpperOffset())
            lower = spec.getPrecursors()[0].getMZ() - spec.getPrecursors()[0].getIsolationWindowLowerOffset()
            upper = spec.getPrecursors()[0].getMZ() + spec.getPrecursors()[0].getIsolationWindowUpperOffset()

            # (m/z, intensity) rows of the last survey scan, restricted to the
            # isolation window [lower, upper].
            s = np.array([(i.getMZ(),i.getIntensity()) for i in exp[last_surveyscan_index]], ndmin = 2)
            s = s[np.where(np.logical_and(s[:, 0]>=lower, s[:, 0]<=upper))[0]]
            isolation_window_metrics['peaks_in_window'].append(np.shape(s)[0])

            int_sort_desc = np.flip(np.argsort(s[:,1]))
            if np.shape(s)[0] > 1:
                # NOTE(review): the trailing [0] binds to the divisor, so every
                # ranked intensity except the last is divided by the single
                # second-ranked intensity and an array is appended. The
                # original remark describes pairwise ratios instead - confirm.
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(
                    s[int_sort_desc][:-1,1]/s[int_sort_desc][1:,1][0])  # intensity ratio between top1&2, 2&3, ...
            else:
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(0)  # bigger is better, though best is 0

            isolation_window_metrics['summed_window_intensity'].append(np.sum(s[int_sort_desc][:,1]))
            isolation_window_metrics['isolation_target_intensity'].append(spec.getPrecursors()[0].getIntensity())

            # TODO this needs to go outside
            # Absolute m/z tolerance, chosen from the spectrum's 'filter string'
            # meta value when present; 0.5 otherwise.
            tol = 0.5
            if spec.metaValueExists('filter string'):
                if 'FTMS' in spec.getMetaValue('filter string'):
                    tol = 0.05
                elif 'ITMS' in spec.getMetaValue('filter string'):
                    tol = 0.5
                elif 'QTOF' in spec.getMetaValue('filter string'):  #TOFMS, SQMS, TQMS, SectorMS
                    tol = 0.1

            # ms2 peaks directly from isolation window?
            # True when any window peak m/z matches any MS2 peak m/z within tol;
            # stored as the string 'True'/'False'.
            unfragmented = np.any([np.isclose(i[0],[x.getMZ() for x in spec], atol=tol) for i in s])
            isolation_window_metrics['peaks_in_window_in_ms2'].append(str(unfragmented))

    ## Spectra detail numbers
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS1",
                value=spectrum_acquisition_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS2",
                value=spectrum_acquisition_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectra topn ranks",
                value=spectrum_topn)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Tandem spectrum metric values - MS2",
                value=tandem_spectrum_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS1",
                value=trap_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS2",
                value=trap_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="isolation window metrics",
                value=isolation_window_metrics)
    )

    ## Spectra numbers
    # One count metric per MS level that occurred in the run.
    for levels in mslevelcounts.keys():
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name="Number of MS{l} spectra".format(l=str(levels)),
                    value=mslevelcounts[levels])
        )

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Number of chromatograms",
                value=len(exp.getChromatograms()))
    )

    ## Ranges
    # The m/z range is the range of MS2 precursor m/z values tracked above.
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MZ aquisition range",
                value=[min_mz,max_mz])
    )

    # RT of the first and last spectrum after sorting.
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="RT aquisition range",
                value=[exp[0].getRT(),exp[exp.size()-1].getRT()])
    )

    # TIC
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Total ion current",
                value=tic_tab)
    )

    # Chrom
    # Copy the first chromatogram typed as total ion current, then stop.
    chrom_tab: Dict[str,List[Any]] = defaultdict(list)
    chroms = exp.getChromatograms()
    for t in chroms:
        if t.getChromatogramType() == oms.ChromatogramSettings.ChromatogramType.TOTAL_ION_CURRENT_CHROMATOGRAM:
            for chro_peak in t:
                chrom_tab['RT'].append(chro_peak.getRT())
                chrom_tab['int'].append(chro_peak.getIntensity())
            break

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Chromatogram",
                value=chrom_tab)
    )
    # TODO is there a difference between TIC as defined in MS:1000235 and the chromatogram you get from TRP?? In MZML it says its a MS:1000235 (ion current detected in each of a series of mass spectra) but is it?
    # TODO consider collection of spectrum_native_id

    return mzqc.RunQuality(metadata=meta, qualityMetrics=metrics)
def test001(self):
    """Round-trip SHORT_MS2_FILE.mzData through Spectrum / PeakMap and check the accessors."""
    data_file = "SHORT_MS2_FILE.mzData"
    test_dir = os.path.dirname(os.path.abspath(__file__))
    experiment = MSExperiment()
    FileHandler().loadExperiment(os.path.join(test_dir, "data", data_file),
                                 experiment)
    assert experiment.size() > 0

    # --- single spectrum conversion in both directions ---
    precursor = Precursor()
    precursor.setMZ(1.0)
    precursor.setIntensity(100)
    first_raw = experiment[0]
    first_raw.setPrecursors([precursor])
    first_raw.setMSLevel(2)
    converted = Spectrum.fromMSSpectrum(first_raw)
    instrument_settings = InstrumentSettings()
    instrument_settings.setPolarity(IonSource.Polarity.POSITIVE)
    first_raw.setInstrumentSettings(instrument_settings)

    self.compare_specs(converted, first_raw)
    round_tripped = Spectrum.fromMSSpectrum(converted.toMSSpectrum())
    self.compare_specs(round_tripped, first_raw)

    # --- peak map construction and MS1 peak access ---
    peak_map = PeakMap.fromMSExperiment(experiment)
    assert os.path.basename(peak_map.meta["source"]) == data_file

    rtmin, rtmax = peak_map.rtRange(None)
    all_ms1 = peak_map.msNPeaks(1, rtmin, rtmax)
    assert all_ms1.shape == (1797, 2), all_ms1.shape
    upper_bounded = peak_map.msNPeaks(1, rtmax=rtmax)
    assert np.all(all_ms1 == upper_bounded)
    lower_bounded = peak_map.msNPeaks(1, rtmin=0)
    assert np.all(all_ms1 == lower_bounded)

    first_spectrum = peak_map.spectra[0]
    assert len(list(first_spectrum)) == len(first_spectrum)  # calls iter

    rts = peak_map.allRts()
    assert rts[0] <= peak_map.rtRange()[0]
    assert rts[-1] >= peak_map.rtRange()[1]
    assert len(rts) == 41, len(rts)

    # --- selection by MS level, explicit upper level ---
    only_ms1 = peak_map.levelNSpecs(1, 1)
    only_ms2 = peak_map.levelNSpecs(2, 2)
    ms1_and_ms2 = peak_map.levelNSpecs(1, 2)
    assert len(only_ms1) > 0
    assert len(only_ms2) > 0
    assert len(only_ms1) + len(only_ms2) == len(ms1_and_ms2) == len(peak_map)
    assert only_ms1[0].msLevel == 1
    assert only_ms2[0].msLevel == 2

    # --- same selection relying on the default: nmax = nmin ---
    only_ms1 = peak_map.levelNSpecs(1)
    only_ms2 = peak_map.levelNSpecs(2)
    assert len(only_ms1) > 0
    assert len(only_ms2) > 0
    assert len(only_ms1) + len(only_ms2) == len(ms1_and_ms2) == len(peak_map)
    assert only_ms1[0].msLevel == 1
    assert only_ms2[0].msLevel == 2

    ms1_rts = peak_map.levelOneRts()
    assert len(ms1_rts) == len(only_ms1)

    # --- conversion back to MSExperiment ---
    self.compare_exp(peak_map, experiment, data_file)
    rebuilt = PeakMap.fromMSExperiment(peak_map.toMSExperiment())
    self.compare_exp(rebuilt, experiment, data_file)

    # --- extraction by rt window drops one boundary spectrum per cut ---
    trimmed = peak_map.extract(rtmin=rtmin + .000001)
    assert len(trimmed) == len(peak_map) - 1
    trimmed = trimmed.extract(rtmax=rtmax - 0.000001)
    assert len(trimmed) == len(peak_map) - 2

    # --- m/z ranges per MS level ---
    mzmin, mzmax = peak_map.mzRange(2)
    assert mzmin < 250
    assert mzmax > 860

    mzmin, mzmax = peak_map.mzRange(1)
    assert mzmin >= 700
    assert mzmax <= 1050

    # --- extraction by m/z window ---
    trimmed = peak_map.extract(rtmin + 0.00001, mzmin=300)
    trimmed_mzmin, trimmed_mzmax = trimmed.mzRange()
    assert trimmed_mzmin >= 300
    assert trimmed_mzmax == mzmax

    trimmed = peak_map.extract(rtmin=rtmin + 0.000001, mzmin=300, mzmax=1000)
    trimmed_mzmin, trimmed_mzmax = trimmed.mzRange()
    assert trimmed_mzmin >= 300
    assert trimmed_mzmax <= 1000

    # --- peaksInRange needs at least one bound ---
    with pytest.raises(Exception):
        peak_map.spectra[0].peaksInRange()

    below_max = peak_map.spectra[0].peaksInRange(mzmax=10000)
    above_min = peak_map.spectra[0].peaksInRange(mzmin=0)
    assert np.all(below_max == above_min)

    # --- specsInRange includes both rt boundaries ---
    every_spectrum = list(peak_map.spectra)
    in_full_range = peak_map.specsInRange(0, 99999)
    up_to_first = peak_map.specsInRange(0, every_spectrum[0].rt)
    from_last = peak_map.specsInRange(every_spectrum[-1].rt, 999999)

    assert every_spectrum == in_full_range
    assert up_to_first == [every_spectrum[0]]
    assert from_last == [every_spectrum[-1]]

    # --- representing peak of a map with mixed polarities ---
    peak_map.spectra[0].polarity = "+"
    peak_map.spectra[1].polarity = "-"

    peak_map = PeakMap(peak_map.spectra)
    representing_mz = peak_map.representingMzPeak(0, 99999, 0, 99999)
    assert abs(representing_mz - 831.86538) < 0.0001
def test001(self):
    """Round-trip SHORT_MS2_FILE.mzData (path relative to the working directory) through Spectrum / PeakMap."""
    data_file = "SHORT_MS2_FILE.mzData"
    experiment = MSExperiment()
    FileHandler().loadExperiment(os.path.join("data", data_file), experiment)
    assert experiment.size() > 0

    # --- single spectrum conversion in both directions ---
    precursor = Precursor()
    precursor.setMZ(1.0)
    precursor.setIntensity(100)
    first_raw = experiment[0]
    first_raw.setPrecursors([precursor])
    first_raw.setMSLevel(2)
    converted = Spectrum.fromMSSpectrum(first_raw)
    instrument_settings = InstrumentSettings()
    instrument_settings.setPolarity(IonSource.Polarity.POSITIVE)
    first_raw.setInstrumentSettings(instrument_settings)

    self.compare_specs(converted, first_raw)
    round_tripped = Spectrum.fromMSSpectrum(converted.toMSSpectrum())
    self.compare_specs(round_tripped, first_raw)

    # --- peak map construction and MS1 peak access ---
    peak_map = PeakMap.fromMSExperiment(experiment)
    assert os.path.basename(peak_map.meta["source"]) == data_file

    rtmin, rtmax = peak_map.rtRange()
    all_ms1 = peak_map.msNPeaks(1, rtmin, rtmax)
    assert all_ms1.shape == (1797, 2), all_ms1.shape
    upper_bounded = peak_map.msNPeaks(1, rtmax=rtmax)
    assert np.all(all_ms1 == upper_bounded)
    lower_bounded = peak_map.msNPeaks(1, rtmin=0)
    assert np.all(all_ms1 == lower_bounded)

    first_spectrum = peak_map.spectra[0]
    assert len(list(first_spectrum)) == len(first_spectrum)  # calls iter

    rts = peak_map.allRts()
    assert (rts[0], rts[-1]) == peak_map.rtRange()
    assert len(rts) == 41, len(rts)

    # --- selection by MS level, explicit upper level ---
    only_ms1 = peak_map.levelNSpecs(1, 1)
    only_ms2 = peak_map.levelNSpecs(2, 2)
    ms1_and_ms2 = peak_map.levelNSpecs(1, 2)
    assert len(only_ms1) > 0
    assert len(only_ms2) > 0
    assert len(only_ms1) + len(only_ms2) == len(ms1_and_ms2) == len(peak_map)
    assert only_ms1[0].msLevel == 1
    assert only_ms2[0].msLevel == 2

    # --- same selection relying on the default: nmax = nmin ---
    only_ms1 = peak_map.levelNSpecs(1)
    only_ms2 = peak_map.levelNSpecs(2)
    assert len(only_ms1) > 0
    assert len(only_ms2) > 0
    assert len(only_ms1) + len(only_ms2) == len(ms1_and_ms2) == len(peak_map)
    assert only_ms1[0].msLevel == 1
    assert only_ms2[0].msLevel == 2

    ms1_rts = peak_map.levelOneRts()
    assert len(ms1_rts) == len(only_ms1)

    # --- conversion back to MSExperiment ---
    self.compare_exp(peak_map, experiment, data_file)
    rebuilt = PeakMap.fromMSExperiment(peak_map.toMSExperiment())
    self.compare_exp(rebuilt, experiment, data_file)

    # --- extraction by rt window drops one boundary spectrum per cut ---
    trimmed = peak_map.extract(rtmin=rtmin + .000001)
    assert len(trimmed) == len(peak_map) - 1
    trimmed = trimmed.extract(rtmax=rtmax - 0.000001)
    assert len(trimmed) == len(peak_map) - 2

    # --- overall m/z range and extraction by m/z window ---
    mzmin, mzmax = peak_map.mzRange()
    assert mzmin < 250
    assert mzmax > 1049

    trimmed = peak_map.extract(rtmin + 0.00001, mzmin=300)
    trimmed_mzmin, trimmed_mzmax = trimmed.mzRange()
    assert trimmed_mzmin >= 300
    assert trimmed_mzmax == mzmax

    trimmed = peak_map.extract(rtmin=rtmin + 0.000001, mzmin=300, mzmax=1000)
    trimmed_mzmin, trimmed_mzmax = trimmed.mzRange()
    assert trimmed_mzmin >= 300
    assert trimmed_mzmax <= 1000

    # --- peaksInRange without any bound goes through exceptionwrapper ---
    exceptionwrapper(lambda: peak_map.spectra[0].peaksInRange())

    below_max = peak_map.spectra[0].peaksInRange(mzmax=10000)
    above_min = peak_map.spectra[0].peaksInRange(mzmin=0)
    assert np.all(below_max == above_min)

    # --- specsInRange includes both rt boundaries ---
    every_spectrum = peak_map.spectra[:]
    in_full_range = peak_map.specsInRange(0, 99999)
    up_to_first = peak_map.specsInRange(0, every_spectrum[0].rt)
    from_last = peak_map.specsInRange(every_spectrum[-1].rt, 999999)

    assert every_spectrum == in_full_range
    assert up_to_first == [every_spectrum[0]]
    assert from_last == [every_spectrum[-1]]

    # --- representing peak of a map with mixed polarities ---
    peak_map.spectra[0].polarity = "+"
    peak_map.spectra[1].polarity = "-"

    peak_map = PeakMap(peak_map.spectra)
    representing_mz = peak_map.representingMzPeak(0, 99999, 0, 99999)
    assert abs(representing_mz - 831.86538) < 0.00001
def write_mzml(file_name, spectra, title_prefix='Spectrum ', output_dir='./'):
    """Write MS2 spectra to an mzML file and patch the stored XML afterwards.

    The spectra are first stored with ``MzMLFile().store``; the file is then
    re-parsed and edited: header attributes, a source file entry, an
    instrument configuration, per-spectrum ids, a scan window, a centroid
    cvParam and a synthetic TIC chromatogram are added.

    Arguments:
        file_name: output file name; '.mzML' is appended when the name does
            not contain '.mzml' (case-insensitive).
        spectra: iterable of mappings with the keys
            'spectrum' (m/z values) and 'precursor_mass' (required),
            'abundance' (intensities; 500 per peak is used when missing),
            'name' (spectrum id; ``title_prefix`` + running index when
            missing) and 'precursor_charge' (2 when missing).
        title_prefix: prefix for generated spectrum ids.
        output_dir: directory the file is written to.

    Returns:
        The path of the written file.

    Raises:
        ValueError: if the stored file contains no mzML element.
    """
    import os

    ns = '{http://psi.hupo.org/ms/mzml}'

    if '.mzml' not in file_name.lower():
        file_name += '.mzML'
    # join() instead of string concatenation so that output_dir works with or
    # without a trailing separator.
    output_file = os.path.join(output_dir, file_name)

    exp = MSExperiment()
    ids = []
    tic = []
    for sp_count, spectrum in enumerate(spectra):
        if 'name' in spectrum:
            ids.append(spectrum['name'])
        else:
            ids.append(title_prefix + str(sp_count))

        if 'abundance' in spectrum:
            abundances = spectrum['abundance']
        else:
            print('oops')
            abundances = [500 for _ in spectrum['spectrum']]
        tic.append(sum(abundances))

        spec = MSSpectrum()
        spec.setMSLevel(2)
        spec.set_peaks([spectrum['spectrum'], abundances])

        prec = Precursor()
        # fall back to charge 2 when no precursor charge was provided
        if 'precursor_charge' in spectrum:
            prec.setCharge(spectrum['precursor_charge'])
        else:
            prec.setCharge(2)
        prec.setMZ(spectrum['precursor_mass'])
        spec.setPrecursors([prec])

        spec.sortByPosition()
        exp.addSpectrum(spec)

    MzMLFile().store(output_file, exp)

    # load it back and fix it
    tree = ET.parse(output_file)
    mzml = tree.find(ns + 'mzML')
    if mzml is None and tree.getroot().tag == ns + 'mzML':
        # non-indexed file: the mzML element is the document root itself
        mzml = tree.getroot()
    if mzml is None:
        raise ValueError('no mzML element found in %s' % output_file)

    # add an id to the mzml
    mzml.set('id', 'testSpectraFileFixed')
    mzml.set('xmlns', "http://psi.hupo.org/ms/mzml")
    mzml.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')

    # remove some info from the mzml header; tolerate it being absent
    mzml.attrib.pop('accession', None)

    # add file description stuff to the fileDescription tag
    fileDescription = mzml.find(ns + 'fileDescription')
    sourceFileElement = ET.fromstring(
        '<sourceFileList count="1"> '
        '<sourceFile id="NOD2_E3.mzXML" name="NOD2_E3.mzXML" location="file:///C:/Users/zachmcgrath/Downloads"> '
        '<cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/> '
        '<cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML format" value=""/> '
        '<cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="89ecb0dd31ca3a2fdf5ef2c4f5341f6e5e9f06f0"/> '
        '</sourceFile> </sourceFileList>'
    )
    fileDescription.append(sourceFileElement)

    instrumentConfigurationList = mzml.find(ns + 'instrumentConfigurationList')
    instrumentConfiguration = ET.fromstring(
        ' <instrumentConfiguration id="IC1"> <componentList count="3"> '
        '<source order="1"> <userParam name="msIonisation" value="HPLC-Chip/MS"/> </source> '
        '<analyzer order="1"> <userParam name="msMassAnalyzer" value="Q-TOF"/> </analyzer> '
        '<detector order="1"> <userParam name="msDetector" value="ADC"/> </detector> '
        '</componentList> </instrumentConfiguration>'
    )
    # NOTE: the previous ``for child in ...: del child`` loop only unbound the
    # loop variable and removed nothing, so existing configurations are kept
    # here as before (they may still be referenced from elsewhere in the file).
    instrumentConfigurationList.append(instrumentConfiguration)

    run = mzml.find(ns + 'run')
    spectrumList = run.find(ns + 'spectrumList')
    print(len(spectrumList))

    removed_any = False
    # Iterate over a snapshot: elements may be removed from spectrumList, and
    # removing during direct iteration would skip the following element.
    for i, spectrumElement in enumerate(list(spectrumList)):
        # set the id
        spectrumElement.set('id', ids[i])

        pl = spectrumElement.find(ns + 'precursorList')
        bdal = spectrumElement.find(ns + 'binaryDataArrayList')
        if bdal is None or pl is None:
            print(spectrumElement)
            spectrumList.remove(spectrumElement)
            removed_any = True
            continue  # nothing more to do for a dropped spectrum

        # add the scan window range to the scan element itself
        # (``find(...)[0]`` used to pick the first *child* of <scan>)
        scanListEl = spectrumElement.find(ns + 'scanList')
        scanEl = scanListEl.find(ns + 'scan') if scanListEl is not None else None
        if scanEl is not None:
            scanEl.append(ET.fromstring(
                '<scanWindowList count="1"> <scanWindow> '
                '<cvParam cvRef="MS" accession="MS:1000501" value="0" name="scan window lower limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> '
                '<cvParam cvRef="MS" accession="MS:1000500" value="10000" name="scan window upper limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> '
                '</scanWindow></scanWindowList>'
            ))

        # need to add this
        # <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>
        spectrumElement.append(
            ET.Element(ns + 'cvParam',
                       attrib={
                           'name': 'centroid spectrum',
                           'accession': 'MS:1000127',
                           'value': ''
                       }))

    if removed_any:
        spectrumList.set('count', str(len(spectrumList)))

    # Synthetic TIC chromatogram: time points 1..n against the summed
    # abundance of each spectrum. Values are packed as little-endian doubles
    # to match the declared "64-bit float" cvParam, and the base64 text is
    # decoded to str so it is embedded as plain text rather than "b'...'".
    point_count = len(tic)
    timearray = base64.b64encode(
        struct.pack(f'<{point_count}d', *range(1, point_count + 1))).decode('ascii')
    ticarray = base64.b64encode(
        struct.pack(f'<{point_count}d', *tic)).decode('ascii')
    chromatogramListElmentString = (
        ' <chromatogramList count="1" defaultDataProcessingRef="mzLibProcessing">'
        f' <chromatogram id="TIC" index="0" defaultArrayLength="{point_count}" dataProcessingRef="mzLibProcessing">'
        ' <cvParam cvRef="MS" accession="MS:1000235" value="" name="total ion current chromatogram" />'
        ' <binaryDataArrayList count="2">'
        f' <binaryDataArray encodedLength="{len(timearray)}">'
        ' <cvParam cvRef="MS" accession="MS:1000523" value="" name="64-bit float" />'
        ' <cvParam cvRef="MS" accession="MS:1000576" value="" name="no compression" />'
        ' <cvParam cvRef="MS" accession="MS:1000595" value="" name="time array" unitAccession="UO:0000031" unitName="Minutes" unitCvRef="UO" />'
        f' <binary>{timearray}</binary> </binaryDataArray>'
        f' <binaryDataArray encodedLength="{len(ticarray)}">'
        ' <cvParam cvRef="MS" accession="MS:1000523" value="" name="64-bit float" />'
        ' <cvParam cvRef="MS" accession="MS:1000576" value="" name="no compression" />'
        ' <cvParam cvRef="MS" accession="MS:1000515" value="" name="intensity array" unitAccession="MS:1000131" unitName="number of counts" unitCvRef="MS" />'
        f' <binary>{ticarray}</binary> </binaryDataArray>'
        ' </binaryDataArrayList> </chromatogram> </chromatogramList>'
    )
    run.append(ET.fromstring(chromatogramListElmentString))

    tree.write(output_file, encoding='utf-8', xml_declaration=True)
    return output_file
def create(in_file="",
           in_featureXML="",
           spec_out="spectrum.png",
           label_out="labels.png",
           feat_out="features.png",
           hdf5_out="data.h5",
           width=1024,
           height=1024,
           rt_step_size=None,
           mz_step_size=None,
           save_png=False):
    """Create .png images for spectra, labels and bounding boxes.

    Spectra are read from mzML file whereas labels and bounding boxes are read
    from featureXML file. Labels image is not generated by OpenMS's
    ImageCreator. The gridded spectra ('/data'), the labels ('/label') and the
    bounding-box vectors ('/feature') are stored in the HDF5 file.

    Arguments:
        in_file: file name of input spectrum file (.mzML)
        in_featureXML: file name of input features file (.featureXML)
        spec_out: name for output spectrum image
        label_out: name for output label image
        feat_out: name for output features image
        hdf5_out: name for HDF5 file
        width: Number of pixels in m/z dimension
               if 0, width defined w.r.t. mz_step_size
               (default: 1024, min: 0)
        height: Number of pixels in r/t dimension
                if 0, height defined w.r.t rt_step_size
                (default: 1024, min: 0)
        rt_step_size: step_size for discrete r/t values, used only if
                      height is 0 (default 1); otherwise it is derived from
                      the r/t range and height
        mz_step_size: step_size for discrete m/z values, used only if
                      width is 0 (default 0.1); otherwise it is derived from
                      the m/z range and width
        save_png: if False, png images will not be saved

    Raises:
        AttributeError: if an input file name is empty, an output name has the
            wrong extension, or width/height is negative.
    """
    # --- argument validation (runs before any file is touched) ---
    if in_file == "":
        raise AttributeError('Input mzML file not specified')
    if in_featureXML == "":
        raise AttributeError('Input featureXML file not specified.')
    for arg_name, arg_value, suffix in (('spec_out', spec_out, '.png'),
                                        ('feat_out', feat_out, '.png'),
                                        ('label_out', label_out, '.png'),
                                        ('hdf5_out', hdf5_out, '.h5')):
        if not arg_value.endswith(suffix):
            raise AttributeError('%s should have %s format' %
                                 (arg_name, suffix))
    for arg_name, arg_value in (('width', width), ('height', height)):
        if arg_value < 0:
            raise AttributeError('%s should be >= 0' % arg_name)

    # Messages are printed as one pre-formatted string so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).

    # load mzML file
    print("Reading %s ..." % in_file)
    exp = MSExperiment()
    MzMLFile().load(in_file, exp)
    print("Number of Spectra = " + str(exp.size()))

    # load featureXML
    print("Reading %s ..." % in_featureXML)
    feat_map = FeatureMap()
    FeatureXMLFile().load(in_featureXML, feat_map)
    print("number of peptide features = " + str(feat_map.size()))

    # create Spectrum Dictionary and take the image extent from the spectra
    # (the feature map ranges are not used for the extent)
    exp.updateRanges(1)
    max_rt, max_mz = exp.getMax()
    min_rt, min_mz = exp.getMin()
    spectrum_dict = _create_spec_dict(exp)

    # create Features Dictionary
    feature_dict = _create_feat_dict(feat_map)

    # set height according to rt range
    if height == 0:
        if rt_step_size is None:
            rt_step_size = 1
        height = int((max_rt - min_rt) / rt_step_size)
    else:
        rt_step_size = float(max_rt - min_rt) / height

    # set width according to mz range
    if width == 0:
        if mz_step_size is None:
            mz_step_size = 0.1
        width = int((max_mz - min_mz) / mz_step_size)
    else:
        mz_step_size = float(max_mz - min_mz) / width

    # Extract and grid peak data from MSExperiment
    # Perform Bilinear Interpolation for spectra
    bilip = _interpolation(exp)

    # array for storing specta values using Bilinear Interpolation
    spec_arr = _create_data_array(min_rt, max_rt, min_mz, max_mz, height,
                                  width)
    spec_arr = _fill_spec_array(spec_arr, bilip, spectrum_dict)
    # flip the array along rows so that the image resembles TOPPView plot
    spec_arr[1:, :] = np.flip(spec_arr[1:, :], 0)

    if save_png is True:
        print("Saving spectra image %s ..." % spec_out)
        spec_arr = _save_spectrum_image(spec_arr, spec_out)

    # Open in append mode explicitly: datasets are created below, and newer
    # h5py releases default to read-only when no mode is given. The context
    # manager closes the file even when a later step fails.
    with h5py.File(hdf5_out, 'a') as outfile:
        print("Saving '/data' group in  %s ..." % hdf5_out)
        data = outfile.create_dataset("/data",
                                      spec_arr[1:, 1:].shape,
                                      chunks=True,
                                      compression="gzip",
                                      dtype=np.float32,
                                      data=spec_arr[1:, 1:])
        spec_arr = None  # free memory used by spec_arr

        data.attrs['rt_min'] = min_rt
        data.attrs['rt_max'] = max_rt
        data.attrs['rt_step_size'] = rt_step_size
        data.attrs['mz_min'] = min_mz
        data.attrs['mz_max'] = max_mz
        data.attrs['mz_step_size'] = mz_step_size

        # labels array
        dot_arr = _create_data_array(min_rt, max_rt, min_mz, max_mz, height,
                                     width)
        dot_arr = _insert_dot_values(feature_dict, dot_arr)

        if save_png is True:
            # array for storing feature values for png image;
            # bbox rectangle array contains dot values too
            bbox_arr = np.copy(dot_arr)
            bbox_arr = _insert_rect_values(feature_dict, bbox_arr)
            bbox_arr[1:, :] = np.flip(bbox_arr[1:, :], 0)

            print("Saving features image %s ..." % feat_out)
            bbox_arr = _save_features_image(bbox_arr, feat_out)
            bbox_arr = None

        # array for storing feature as vectors;
        # bbox vector arr uses the un-flipped dot_arr for index lookup
        bbox_vec = _create_bbox_vectors(feature_dict, dot_arr, mz_step_size,
                                        rt_step_size)

        dot_arr[1:, :] = np.flip(dot_arr[1:, :], 0)

        if save_png is True:
            print("Saving labels image %s ..." % label_out)
            dot_arr = _save_labels_image(dot_arr, label_out)

        print("Saving '/label' group in  %s ..." % hdf5_out)
        outfile.create_dataset("/label",
                               dot_arr[1:, 1:].shape,
                               chunks=True,
                               compression="gzip",
                               dtype=np.float32,
                               data=dot_arr[1:, 1:])
        dot_arr = None

        bbox_vec = np.flip(bbox_vec, axis=1)

        print("Saving '/feature' group in %s ..." % hdf5_out)
        outfile.create_dataset("/feature",
                               bbox_vec.shape,
                               chunks=True,
                               compression="gzip",
                               dtype=np.float32,
                               data=bbox_vec)
        bbox_vec = None
def store(self, ofname: str, peakMap: pyopenms.MSExperiment):
    """Write the MS level 2 spectra of ``peakMap`` to ``ofname`` as MS2 text.

    The experiment is sorted (when it is not already) and its ranges are
    updated before writing, so ``peakMap`` may be modified in place. A block
    of header lines is written first; then, for every MS level 2 spectrum, a
    scan line, several info lines, a charge line and the peak list. MS level
    1 spectra are not written, but the scan number of the most recent one is
    reported as ``PrecursorScan`` of the MS2 scans that follow it.

    Args:
        ofname: Path of the output file; it is created or truncated.
        peakMap: Experiment to export. The first and last spectrum are read
            unconditionally for the header, and the first precursor of every
            MS2 spectrum is read unconditionally, so both must exist.
    """
    #sort peakMap if necessary
    if not peakMap.isSorted():
        peakMap.sortSpectra()
        peakMap.updateRanges()

    # The context manager guarantees the file is flushed and closed even when
    # writing a spectrum raises part-way through.
    with open(ofname, 'w') as outF:
        firstSpectrum = peakMap.getSpectrum(0)
        firstScan = self._getScan(firstSpectrum.getNativeID().decode('utf-8'))
        dataType = firstSpectrum.getType()
        dataType = 'Centroid' if dataType == 1 else 'Profile' if dataType == 2 else 'Unknown'
        lastScan = self._getScan(
            peakMap.getSpectrum(peakMap.getNrSpectra() -
                                1).getNativeID().decode('utf-8'))
        # The matching precursor file is assumed to carry the same name with
        # 'ms2' swapped for 'ms1'.
        precursorFile = basename(ofname).replace('ms2', 'ms1')

        #print header
        outF.write(
            self._writeValue(
                MS2File._h_tag, 'CreationDate',
                datetime.datetime.now().strftime('%m/%d/%Y %I:%M:%S %p')))
        outF.write(self._writeValue(MS2File._h_tag, 'Extractor', 'msConvert'))
        outF.write(self._writeValue(MS2File._h_tag, 'ExtractorVersion', '0.1'))
        outF.write(
            self._writeValue(MS2File._h_tag, 'Comments',
                             'msConvert was written by Aaron Maurais, 2019'))
        outF.write(self._writeValue(MS2File._h_tag, 'ExtractorOptions', 'MS2'))
        outF.write(
            self._writeValue(MS2File._h_tag, 'AcquisitionMethod',
                             'Data-Dependent'))
        outF.write(
            self._writeValue(MS2File._h_tag, 'InstrumentType', 'Unknown'))
        outF.write(self._writeValue(MS2File._h_tag, 'ScanType', 'MS2'))
        outF.write(self._writeValue(MS2File._h_tag, 'DataType', dataType))
        outF.write(self._writeValue(MS2File._h_tag, 'FirstScan', firstScan))
        outF.write(self._writeValue(MS2File._h_tag, 'LastScan', lastScan))

        #iterate through spectra
        preScan = 'Unknown'
        for scan in peakMap.getSpectra():
            if scan.getMSLevel() == 1:
                # Remember the latest survey scan for the MS2 scans after it.
                preScan = self._getScan(scan.getNativeID().decode('utf-8'))

            if scan.getMSLevel() == 2:
                #write header info
                curScan = self._getScan(scan.getNativeID().decode('utf-8'))
                precursor = scan.getPrecursors()[0]
                preCharge = int(precursor.getCharge())
                preMZ = precursor.getMZ()

                #print scan line
                outF.write('{}\t{}\t{}\t{}\n'.format(MS2File._s_tag,
                                                     curScan.zfill(6),
                                                     curScan.zfill(6), preMZ))

                #print scan info
                outF.write(
                    self._writeValue(MS2File._i_tag, 'RetTime', scan.getRT()))
                outF.write(
                    self._writeValue(MS2File._i_tag, 'PrecursorInt',
                                     precursor.getIntensity()))
                outF.write(
                    self._writeValue(MS2File._i_tag, 'IonInjectionTime',
                                     'Unknown'))
                ameth = ' '.join(MS2File._activationMethods[x]
                                 for x in precursor.getActivationMethods())
                if not ameth:
                    ameth = 'Unknown'
                outF.write(
                    self._writeValue(MS2File._i_tag, 'ActivationType', ameth))
                outF.write(
                    self._writeValue(MS2File._i_tag, 'PrecursorFile',
                                     precursorFile))
                outF.write(
                    self._writeValue(MS2File._i_tag, 'PrecursorScan', preScan))
                outF.write(
                    self._writeValue(MS2File._i_tag, 'InstrumentType',
                                     'Unknown'))

                #write z line
                #after charge, the M+H m/z for the ion is listed, so calculate that here
                outF.write(
                    self._writeValue(
                        MS2File._z_tag, preCharge,
                        (float(preMZ) * preCharge) -
                        (preCharge * MS2File._H_mass) + MS2File._H_mass))

                #write ions
                for ion in scan:
                    outF.write('{0:.4f} {1:.1f}\n'.format(
                        round(ion.getMZ(), 4), round(ion.getIntensity(), 1)))
from pyopenms import MSExperiment, MzXMLFile
import pandas as pd

labview_file = "/home/james/Downloads/20200124_TP_2.csv"
mzXML_file = "/home/james/Downloads/20200124_17645.mzXML"

#
# Make smaller mzXML file to keep in repo
#
n_spectra_to_keep = 20

exp = MSExperiment()
MzXMLFile().load(mzXML_file, exp)
print(f"Full size of experiment {len(exp.getSpectra())}")

# Keep only the leading spectra; a slice simply returns everything when the
# experiment holds fewer than n_spectra_to_keep of them.
spec = exp.getSpectra()[:n_spectra_to_keep]
exp.setSpectra(spec)
print(f"New size of experiment {len(exp.getSpectra())}")

MzXMLFile().store("test.mzXML", exp)

#
# Take subset of CSV LabView data
#
cols = ["time", "b", "temp", "d", "e", "f", "g", "h"]
df = pd.read_csv(labview_file, names=cols)
# Shift the time column so that the first row is the zero point.
df["time"] -= df["time"][0]
def load(self, ifname: str, peakMap: pyopenms.MSExperiment):
    """Parse the ``BEGIN IONS`` / ``END IONS`` blocks of ``ifname`` into ``peakMap``.

    Each block becomes one MS level 2 spectrum with a single precursor. The
    header keys ``TITLE`` (a ``scan=N`` / ``scan_N`` token becomes the native
    ID ``scan=N``), ``PEPMASS`` (m/z and optional intensity), ``CHARGE`` and
    ``RTINSECONDS`` are interpreted; every other header line is ignored. The
    first line starting with a digit begins the peak list, which runs to
    ``END IONS`` or the end of the file. Lines outside a block are skipped.

    Surrounding whitespace on every line is ignored, blank lines are skipped,
    and fields may be separated by any run of whitespace. A ``CHARGE`` value
    that is not an unsigned integer with an optional trailing sign is left
    unset; the sign itself is not applied to the stored charge.

    Args:
        ifname: Path of the text file to read.
        peakMap: Experiment that receives the parsed spectra through
            ``setSpectra``.

    Raises:
        AssertionError: If ``PEPMASS`` has more than two values or a peak line
            does not have exactly two values.
        ValueError: If a numeric field cannot be converted to ``float``.
    """
    # Read everything up front and release the handle immediately.
    with open(ifname, 'r') as inF:
        lines = [line.strip() for line in inF.read().splitlines()]

    curLine = 0
    nLines = len(lines)

    #generate spectrum list
    spectraList = list()
    while curLine < nLines:
        if lines[curLine] == 'BEGIN IONS':
            spectrum = pyopenms.MSSpectrum()
            spectrum.setMSLevel(2)
            precursor = pyopenms.Precursor()
            curLine += 1
            while curLine < nLines:
                line = lines[curLine]
                if line == 'END IONS':
                    # Block without a peak list: close it here instead of
                    # trying to read 'END IONS' as a KEY=VALUE header.
                    break
                if not line:
                    # Blank line; indexing line[0] would raise IndexError.
                    curLine += 1
                    continue

                if line[0].isalpha():
                    match = re.search(r'^([A-Z]+)=(.+)$', line)
                    # Header lines that are not KEY=VALUE are ignored.
                    key, value = match.groups() if match else (None, None)
                    if key == 'TITLE':
                        for s in value.split(','):
                            scanMatch = re.search(r'^scan[_=]([0-9]+)', s)
                            if scanMatch:
                                spectrum.setNativeID('scan={}'.format(
                                    scanMatch.group(1)))
                    elif key == 'PEPMASS':
                        preMZ = [float(x) for x in value.split()]
                        assert (len(preMZ) <= 2)
                        precursor.setMZ(preMZ[0])
                        if len(preMZ) > 1:
                            precursor.setIntensity(preMZ[1])
                    elif key == 'CHARGE':
                        chargeMatch = re.search(r'^([0-9]+)[+-]?$', value)
                        if chargeMatch:
                            precursor.setCharge(int(chargeMatch.group(1)))
                    elif key == 'RTINSECONDS':
                        spectrum.setRT(float(value))
                elif line[0].isnumeric():
                    # From the first numeric line on, everything up to
                    # 'END IONS' is peak data.
                    while curLine < nLines and lines[curLine] != 'END IONS':
                        if lines[curLine]:
                            ion = [float(x) for x in lines[curLine].split()]
                            assert (len(ion) == 2)
                            ion_temp = pyopenms.Peak1D()
                            ion_temp.setMZ(ion[0])
                            ion_temp.setIntensity(ion[1])
                            spectrum.push_back(ion_temp)
                        curLine += 1
                    break
                curLine += 1
            spectrum.setPrecursors([precursor])
            spectraList.append(spectrum)
        # Steps past 'END IONS' after a block, or past any line outside one.
        curLine += 1
    peakMap.setSpectra(spectraList)
def combine_experiments(exp1: ms.MSExperiment, exp2: ms.MSExperiment) -> None:
    """Append every spectrum of ``exp2`` to ``exp1``, preserving their order.

    ``exp1`` is extended in place through ``addSpectrum``; ``exp2`` is only
    read, by index, through ``getSpectrum``.
    """
    # The count is taken once, before anything is added, so the number of
    # copies is fixed even when both arguments are the same experiment.
    spectrum_count = exp2.getNrSpectra()
    for index in range(spectrum_count):
        exp1.addSpectrum(exp2.getSpectrum(index))
def find_spectral_groups(input_file,
                         MS1_round_precision=2,
                         min_match_peaks=1,
                         ms2_tol=0.1,
                         score_thresh=0.8,
                         verbose=False):
    """Group the non-MS1 spectra of an mzML file by precursor m/z and similarity.

    Every spectrum whose MS level is not 1 is binned by the m/z of its first
    precursor, rounded to ``MS1_round_precision`` decimals. Within a bin the
    spectrum is scored against each existing cluster with ``fast_cosine``; it
    is added to the best-scoring cluster when that score reaches
    ``score_thresh`` and otherwise starts a new cluster. On equal scores the
    earliest cluster wins.

    Args:
        input_file: Path of the mzML file to load.
        MS1_round_precision: Decimals kept when rounding the precursor m/z
            that serves as the bin key.
        min_match_peaks: Passed through to ``fast_cosine``.
        ms2_tol: Passed through to ``fast_cosine``.
        score_thresh: Minimum best score for joining an existing cluster.
        verbose: When true, print a progress line every 100 processed spectra.

    Returns:
        dict mapping each rounded precursor m/z to its list of ``Cluster``
        objects.
    """
    from pyopenms import MSExperiment, MzMLFile
    from mnet import Spectrum, Cluster
    from scoring_functions import fast_cosine
    print("Finding groups of spectra in {}".format(input_file))
    exp = MSExperiment()
    MzMLFile().load(input_file, exp)

    clusters = {
    }  # dictionary, key is precursor mz, value is a list of clusters

    # loop over spectra
    cl_id = 0
    n_spec = 0
    biggest_cluster_size = 0
    for ind, spectrum in enumerate(exp):
        if spectrum.getMSLevel() == 1:
            continue

        raw_precursormz = spectrum.getPrecursors()[0].getMZ()
        precursormz = round(raw_precursormz, MS1_round_precision)
        candidates = clusters.setdefault(precursormz, [])

        # make a Spectrum object; the peaks are materialised as a list so
        # they can be traversed more than once
        mz_values, intensities = spectrum.get_peaks()
        peaks = list(zip(mz_values, intensities))
        s = Spectrum(peaks,
                     input_file,
                     ind,
                     None,
                     raw_precursormz,
                     raw_precursormz,
                     rt=spectrum.getRT())

        # compute the similarity between the spectrum and each existing
        # cluster of this bin, keeping the first best match
        best_score = -1
        best_pos = None
        for pos, cl in enumerate(candidates):
            sc, _ = fast_cosine(s, cl, ms2_tol, min_match_peaks)
            if sc > best_score:
                best_score = sc
                best_pos = pos

        if best_pos is not None and best_score >= score_thresh:
            # Index with best_pos: after the search loop `pos` always refers
            # to the last cluster, not to the best-scoring one.
            best_cluster = candidates[best_pos]
            best_cluster.add_spectrum(s)
            if best_cluster.n_spectra > biggest_cluster_size:
                biggest_cluster_size = best_cluster.n_spectra
        else:
            # no cluster exists yet, or none is similar enough: make one
            candidates.append(Cluster(s, cl_id))
            cl_id += 1

        n_spec += 1
        if verbose and n_spec % 100 == 0:
            print(
                "{} Masses, {} clusters, biggest cluster = {} spectra".format(
                    len(clusters), cl_id, biggest_cluster_size))

    return clusters