def test_encodeNP_SLOF(self):
    """Encode ``self.testData`` with SLOF and compare with a fixed string.

    Reference C++ snippet this test is modelled on::

        String out;
        MSNumpressCoder::NumpressConfig config;
        config.np_compression = MSNumpressCoder::SLOF;
        config.estimate_fixed_point = true; // critical
        bool zlib_compression = false;
        MSNumpressCoder().encodeNP(in, out, zlib_compression, config);

        TEST_EQUAL(out.size(), 24)
        TEST_EQUAL(out, "QMVagAAAAAAZxX3ivPP8/w==")
    """
    expected = "QMVagAAAAAAZxX3ivPP8/w=="

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.SLOF
    settings.estimate_fixed_point = True  # marked "critical" in the C++ reference

    encoded = pyopenms.String()
    # Third argument is the zlib_compression flag of the C++ reference.
    pyopenms.MSNumpressCoder().encodeNP(self.testData, encoded, False, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 24)
    self.assertEqual(result, expected)
def psm_df_mgf(input_map, theoretical, max_delta_ppm, scan_id,
               modified_peptide, precursor_charge):
    """Annotate the peaks stored under ``scan_id`` with theoretical fragment ions.

    Args:
        input_map: Mapping from scan id to a 2-D array; column 0 is read as
            m/z and column 1 as intensity.
        theoretical: Nested mapping ``[modified_peptide][precursor_charge]``
            yielding an ``(ions, ion_masses)`` pair. Both members are indexed
            with integer arrays, so they are presumably NumPy arrays.
        max_delta_ppm: Requested ppm tolerance; it is capped at 30 ppm.
        scan_id: Key of the spectrum in ``input_map``.
        modified_peptide: Peptide string understood by ``po.AASequence``.
        precursor_charge: Precursor charge state.

    Returns:
        A list ``[n_fragments, fragments, product_mzs, intensities, scan_id,
        precursor_mz, modified_peptide, precursor_charge]``. The intensities
        are scaled so that the highest annotated peak equals 10000, unless
        the maximum is not positive.
    """
    top_delta = 30

    series = theoretical[modified_peptide][precursor_charge]
    peaks = input_map[scan_id]
    ions, ion_masses = series
    observed_mz = peaks[:, 0]
    observed_intensity = peaks[:, 1]

    # Absolute ppm error of every observed peak (rows) against every
    # theoretical ion (columns).
    ppm_error = np.abs(
        (observed_mz[:, np.newaxis] - ion_masses) / ion_masses * 1e6)

    # Keep the peaks that have at least one ion within tolerance and assign
    # each of them to its closest ion.
    tolerance = min(max_delta_ppm, top_delta)
    matched = (ppm_error < tolerance).any(1)
    closest_ion = ppm_error[matched].argmin(1)

    fragments = ions[closest_ion]
    product_mzs = ion_masses[closest_ion]
    intensities = observed_intensity[matched]

    # Baseline normalization to highest annotated peak
    max_intensity = np.amax(intensities, initial=0.0)
    if max_intensity > 0:
        intensities /= max_intensity
        intensities *= 10000

    peptide = po.AASequence.fromString(po.String(modified_peptide))
    precursor_mz = peptide.getMonoWeight(
        po.Residue.ResidueType.Full, precursor_charge) / precursor_charge

    return [
        len(fragments),
        fragments,
        product_mzs,
        intensities,
        scan_id,
        precursor_mz,
        modified_peptide,
        precursor_charge,
    ]
def testParamEntry():
    # ParamEntry::isValid takes a "String &" as input argument, which cannot
    # be implemented by a Python string, so no automatic conversion from a
    # basestring should happen here: an explicit pyopenms.String is passed.
    entry = pyopenms.ParamEntry()
    msg = pyopenms.String()

    assert entry.isValid(msg)
    # The message passed to isValid is expected to still be empty afterwards.
    assert msg.c_str() == b""
def read_tims_mgf(tims_mgf_path, psms, theoretical, max_delta_ppm):
    """Read an MGF file and annotate the spectra referenced by ``psms``.

    Args:
        tims_mgf_path: Path of the MGF file. Each record is expected to carry
            a ``TITLE=Cmpd <number>,`` entry, which is used as the scan id.
        psms: DataFrame iterated row-wise as
            ``(scan_id, modified_peptide, precursor_charge)``.
        theoretical: Nested mapping ``[modified_peptide][precursor_charge]``
            giving the ion series handed to ``annotate_mass``.
        max_delta_ppm: Tolerance handed to ``annotate_mass``.

    Returns:
        DataFrame with the columns ``scan_id``, ``modified_peptide``,
        ``precursor_charge``, ``precursor_mz``, ``fragment``, ``product_mz``
        and ``intensity``. Identically annotated peaks are collapsed to the
        most intense one. Intensities are scaled per PSM so that the highest
        annotated peak equals 10000.

    Raises:
        RuntimeError: If a record does not contain exactly one
            ``TITLE=Cmpd <number>,`` entry.
        KeyError: If a PSM references a scan id or peptide/charge that is not
            available.
    """
    # read MGF
    import mmap

    # Raw bytes literals keep the regex escapes (\s, \d) intact instead of
    # relying on Python passing unknown string escapes through unchanged.
    # NOTE(review): the delimiter after BEGIN IONS / before END IONS is
    # matched as any single whitespace byte so that records whose marker
    # lines end in a line feed are found - confirm against real input files.
    record_pattern = re.compile(rb'BEGIN IONS\r?\s(.*?)\sEND IONS',
                                re.MULTILINE | re.DOTALL)
    scan_num_pattern = re.compile(rb'TITLE=Cmpd\s+([0-9]+),')
    peaks_pattern = re.compile(rb'^([\d.]+)\s+([\d.]+)', re.MULTILINE)

    tims_data = {}
    with open(tims_mgf_path, "rb") as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        for e in record_pattern.finditer(mm):
            rec = e.group(1)
            scan_num_findall = scan_num_pattern.findall(rec)
            if len(scan_num_findall) != 1:
                # rec is bytes; decode it so that building the message cannot
                # itself fail with a TypeError.
                raise RuntimeError("Cannot find Cmpd number from "
                                   + rec.decode("utf-8", errors="replace"))
            scan_num = int(scan_num_findall[0])
            tims_data[scan_num] = np.array(peaks_pattern.findall(rec),
                                           dtype=float)

    peaks_list = []
    for scan_id, modified_peptide, precursor_charge in psms.itertuples(index=False):
        ionseries = theoretical[modified_peptide][precursor_charge]
        mz_intensity_array = tims_data[scan_id]

        fragments = []
        product_mzs = []
        intensities = []
        for mz, intensity in mz_intensity_array:
            fragment, product_mz = annotate_mass(mz, ionseries, max_delta_ppm)
            if fragment is not None:
                fragments.append(fragment)
                product_mzs.append(product_mz)
                intensities.append(intensity)

        peaks = pd.DataFrame({'fragment': fragments,
                              'product_mz': product_mzs,
                              'intensity': intensities})
        peaks['scan_id'] = scan_id
        peaks['precursor_mz'] = po.AASequence.fromString(
            po.String(modified_peptide)).getMonoWeight(
                po.Residue.ResidueType.Full, precursor_charge) / precursor_charge
        peaks['modified_peptide'] = modified_peptide
        peaks['precursor_charge'] = precursor_charge

        # Baseline normalization to highest annotated peak. Skipped when no
        # peak was annotated (maximum is NaN) or all intensities are zero, so
        # that the scaling never divides by zero.
        max_intensity = np.max(peaks['intensity'])
        if max_intensity > 0:
            peaks['intensity'] = peaks['intensity'] * (10000 / max_intensity)

        peaks_list.append(peaks)

    if len(peaks_list) > 0:
        transitions = pd.concat(peaks_list)
        # Multiple peaks might be identically annotated, only use most intense
        transitions = transitions.groupby(
            ['scan_id', 'modified_peptide', 'precursor_charge',
             'precursor_mz', 'fragment', 'product_mz']
        )['intensity'].max().reset_index()
    else:
        transitions = pd.DataFrame({'scan_id': [], 'modified_peptide': [],
                                    'precursor_charge': [], 'precursor_mz': [],
                                    'fragment': [], 'product_mz': [],
                                    'intensity': []})
    return transitions
def test_encodeNP_LINEAR(self):
    """Encode ``self.testData`` with LINEAR via ``encodeNP`` and check the result."""
    expected = "QWR64UAAAADo//8/0P//f1kSgA=="

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.LINEAR
    settings.estimate_fixed_point = True

    encoded = pyopenms.String()
    pyopenms.MSNumpressCoder().encodeNP(self.testData, encoded, False, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 28)
    self.assertEqual(result, expected)
def test_encodeNP_PIC(self):
    """Encode ``self.testData`` with PIC via ``encodeNP`` and check the result."""
    expected = "ZGaMXCFQkQ=="

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.PIC
    settings.estimate_fixed_point = True

    encoded = pyopenms.String()
    pyopenms.MSNumpressCoder().encodeNP(self.testData, encoded, False, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 12)
    self.assertEqual(result, expected)
def test_encodeNP_PIC(self):
    """Encode ``self.testData`` with PIC via ``encodeNPRaw`` and check the bytes."""
    expected = b'df\x8c\\!P\x91'

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.PIC
    settings.estimate_fixed_point = True

    encoded = pyopenms.String()
    pyopenms.MSNumpressCoder().encodeNPRaw(self.testData, encoded, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 7)
    self.assertEqual(result, expected)
def test_encodeNP_LINEAR(self):
    """Encode ``self.testData`` with LINEAR via ``encodeNPRaw`` and check the bytes."""
    expected = b'Adz\xe1@\x00\x00\x00\xe8\xff\xff?\xd0\xff\xff\x7fY\x12\x80'

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.LINEAR
    settings.estimate_fixed_point = True

    encoded = pyopenms.String()
    pyopenms.MSNumpressCoder().encodeNPRaw(self.testData, encoded, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 19)
    self.assertEqual(result, expected)
def test_encodeNP_SLOF(self):
    """Encode ``self.testData`` with SLOF via ``encodeNPRaw`` and check the bytes."""
    expected = b'@\xc5Z\x80\x00\x00\x00\x00\x19\xc5}\xe2\xbc\xf3\xfc\xff'

    settings = pyopenms.NumpressConfig()
    settings.np_compression = pyopenms.MSNumpressCoder.NumpressCompression.SLOF
    settings.estimate_fixed_point = True

    encoded = pyopenms.String()
    pyopenms.MSNumpressCoder().encodeNPRaw(self.testData, encoded, settings)

    result = encoded.c_str()
    self.assertEqual(len(result), 16)
    self.assertEqual(result, expected)
def generate_ionseries(peptide_sequence, precursor_charge, fragment_charges=[1,2,3,4], fragment_types=['b','y'], enable_specific_losses = False, enable_unspecific_losses = False):
    """Compute theoretical fragment ion m/z values for a peptide.

    Args:
        peptide_sequence: Peptide string understood by ``po.AASequence``.
        precursor_charge: Precursor charge; fragment charges above it are
            skipped.
        fragment_charges: Fragment charge states to generate (only read,
            never modified).
        fragment_types: Ion types to generate, any of ``a``, ``b``, ``c``
            (prefix ions) and ``x``, ``y``, ``z`` (suffix ions). Only read,
            never modified.
        enable_specific_losses: Also add neutral-loss ions whose loss formula
            is not one of ``H2O1``, ``H3N1``, ``C1H2N2``, ``C1H2N1O1``.
        enable_unspecific_losses: Also add neutral-loss ions whose loss
            formula is one of those four.

    Returns:
        ``(annotations, masses)``: a list of ion labels such as ``b3^1`` or
        ``y5-H2O1^2`` and a float64 NumPy array with the matching m/z values,
        in the same order.

    Raises:
        ValueError: If ``fragment_types`` contains an unsupported ion type.
    """
    # Ion type -> (AASequence accessor, Residue.ResidueType member name).
    ion_rules = {
        'a': ('getPrefix', 'AIon'),
        'b': ('getPrefix', 'BIon'),
        'c': ('getPrefix', 'CIon'),
        'x': ('getSuffix', 'XIon'),
        'y': ('getSuffix', 'YIon'),
        'z': ('getSuffix', 'ZIon'),
    }
    # Reject unknown ion types up front; otherwise no ion would be computed
    # for them and stale values of a previous iteration could be reused.
    unsupported = [t for t in fragment_types if t not in ion_rules]
    if unsupported:
        raise ValueError("Unsupported fragment type(s): %s" % ", ".join(map(str, unsupported)))

    peptide = po.AASequence.fromString(po.String(peptide_sequence))
    sequence = peptide.toUnmodifiedString()

    unspecific_losses = ["H2O1","H3N1","C1H2N2","C1H2N1O1"]
    with_losses = enable_specific_losses or enable_unspecific_losses

    fragments = {}

    for fragment_type in fragment_types:
        accessor_name, residue_type_name = ion_rules[fragment_type]
        cut = getattr(peptide, accessor_name)
        residue_type = getattr(po.Residue.ResidueType, residue_type_name)

        for fragment_charge in fragment_charges:
            if fragment_charge > precursor_charge:
                continue

            for fragment_ordinal in range(1, len(sequence)):
                ion = cut(fragment_ordinal)
                mass = ion.getMonoWeight(residue_type, fragment_charge) / fragment_charge

                # Standard fragment ions
                fragments[fragment_type + str(fragment_ordinal) + "^" + str(fragment_charge)] = mass

                # Losses
                if not with_losses:
                    continue
                for lossfragment_ordinal in range(1, ion.size()):
                    residue = ion.getResidue(lossfragment_ordinal)
                    if not residue.hasNeutralLoss():
                        continue
                    for loss in residue.getLossFormulas():
                        loss_type = loss.toString()
                        # Accept both bytes and str return values.
                        if isinstance(loss_type, bytes):
                            loss_type = loss_type.decode("utf-8")
                        is_unspecific = loss_type in unspecific_losses
                        if (enable_specific_losses and not is_unspecific) or (enable_unspecific_losses and is_unspecific):
                            fragments[fragment_type + str(fragment_ordinal) + "-" + loss_type + "^" + str(fragment_charge)] = mass - (loss.getMonoWeight() / fragment_charge)

    # The builtin float is what the removed ``np.float`` alias referred to.
    return list(fragments.keys()), np.fromiter(fragments.values(), float, len(fragments))
def read_mzml_or_mzxml_impl(path, psms, theoretical, max_delta_ppm, filetype):
    """Load an mzML or mzXML file and annotate the spectra referenced by ``psms``.

    Args:
        path: Path of the spectrum file.
        psms: DataFrame whose rows provide ``scan_id``, ``modified_peptide``
            and ``precursor_charge``.
        theoretical: Nested mapping ``[modified_peptide][precursor_charge]``
            giving the ion series handed to ``annotate_mass``.
        max_delta_ppm: Tolerance handed to ``annotate_mass``.
        filetype: Either ``'mzml'`` or ``'mzxml'``.

    Returns:
        DataFrame with the columns ``scan_id``, ``modified_peptide``,
        ``precursor_charge``, ``precursor_mz``, ``fragment``, ``product_mz``
        and ``intensity``; identically annotated peaks are collapsed to the
        most intense one.
    """
    assert filetype in ('mzml', 'mzxml')

    if filetype == 'mzml':
        reader = po.MzMLFile()
    else:
        reader = po.MzXMLFile()
    reader.setLogType(po.LogType.CMD)

    experiment = po.MSExperiment()
    reader.load(path, experiment)

    group_columns = ['scan_id', 'modified_peptide', 'precursor_charge',
                     'precursor_mz', 'fragment', 'product_mz']

    annotated_frames = []
    for _, psm in psms.iterrows():
        scan_id = psm['scan_id']
        ionseries = theoretical[psm['modified_peptide']][psm['precursor_charge']]
        # The spectrum is looked up at position scan_id - 1.
        spectrum = experiment.getSpectrum(scan_id - 1)

        # Collect (fragment, product m/z, intensity) for every annotated peak.
        hits = []
        for peak in spectrum:
            fragment, product_mz = annotate_mass(peak.getMZ(), ionseries, max_delta_ppm)
            if fragment is None:
                continue
            hits.append((fragment, product_mz, peak.getIntensity()))

        peaks = pd.DataFrame({
            'fragment': [hit[0] for hit in hits],
            'product_mz': [hit[1] for hit in hits],
            'intensity': [hit[2] for hit in hits],
        })
        peaks['scan_id'] = scan_id
        peptide = po.AASequence.fromString(po.String(psm['modified_peptide']))
        peaks['precursor_mz'] = peptide.getMonoWeight(
            po.Residue.ResidueType.Full, psm['precursor_charge']) / psm['precursor_charge']
        peaks['modified_peptide'] = psm['modified_peptide']
        peaks['precursor_charge'] = psm['precursor_charge']

        # Baseline normalization to highest annotated peak
        max_intensity = np.max(peaks['intensity'])
        if max_intensity > 0:
            peaks['intensity'] = peaks['intensity'] * (10000 / max_intensity)

        annotated_frames.append(peaks)

    if not annotated_frames:
        return pd.DataFrame({'scan_id': [], 'modified_peptide': [],
                             'precursor_charge': [], 'precursor_mz': [],
                             'fragment': [], 'product_mz': [],
                             'intensity': []})

    transitions = pd.concat(annotated_frames)
    # Multiple peaks might be identically annotated, only use most intense
    strongest = transitions.groupby(group_columns)['intensity'].max()
    return strongest.reset_index()
def psm_df(input_map, theoretical, max_delta_ppm, scan_id, modified_peptide,
           precursor_charge):
    """Annotate one spectrum of ``input_map`` for a PSM and return a flat record.

    Args:
        input_map: Object providing ``getSpectrum(index)``; the spectrum is
            fetched at index ``scan_id - 1``.
        theoretical: Nested mapping ``[modified_peptide][precursor_charge]``
            giving the ion series handed to ``annotate_mass_spectrum``.
        max_delta_ppm: Tolerance handed to ``annotate_mass_spectrum``.
        scan_id: Scan identifier of the PSM.
        modified_peptide: Peptide string understood by ``po.AASequence``.
        precursor_charge: Precursor charge state.

    Returns:
        A list ``[n_fragments, fragments, product_mzs, intensities, scan_id,
        precursor_mz, modified_peptide, precursor_charge]``. The intensities
        are scaled so that the highest annotated peak equals 10000, unless
        the maximum is not positive.
    """
    series = theoretical[modified_peptide][precursor_charge]
    ms_spectrum = input_map.getSpectrum(scan_id - 1)

    annotated = annotate_mass_spectrum(series, max_delta_ppm, ms_spectrum)
    fragments, product_mzs, intensities = annotated

    # Baseline normalization to highest annotated peak
    max_intensity = np.amax(intensities, initial=0.0)
    if max_intensity > 0:
        intensities /= max_intensity
        intensities *= 10000

    peptide = po.AASequence.fromString(po.String(modified_peptide))
    precursor_mz = peptide.getMonoWeight(
        po.Residue.ResidueType.Full, precursor_charge) / precursor_charge

    return [
        len(fragments),
        fragments,
        product_mzs,
        intensities,
        scan_id,
        precursor_mz,
        modified_peptide,
        precursor_charge,
    ]
def main(options):
    """Run SILACAnalyzer on ``options.infile`` and write a consensus map.

    ``options`` supplies the input and output paths, the ``test`` flag and
    all analyzer parameters passed to ``SILACAnalyzer.initialize``.
    """
    # make sure that the ids are "correct" for the testcase
    if options.test:
        timestamp = pyopenms.DateTime()
        timestamp.set("1999-12-31 23:59:59")
        pyopenms.UniqueIdGenerator().setSeed(timestamp)
    else:
        timestamp = pyopenms.DateTime.now()

    experiment = pyopenms.MSExperiment()
    consensus = pyopenms.ConsensusMap()
    pyopenms.FileHandler().loadExperiment(options.infile, experiment)
    experiment.updateRanges()

    #
    # 1. filter MS1 level (only keep MS1)
    #
    ms1_only = copy.copy(experiment)
    ms1_only.clear(False)
    for spectrum in experiment:
        if spectrum.getMSLevel() != 1:
            continue
        ms1_only.push_back(spectrum)
    experiment = ms1_only
    experiment.sortSpectra(True)

    #
    # 2. set parameters
    #
    analyzer = pyopenms.SILACAnalyzer()
    analyzer.initialize(
        # section sample
        options.selected_labels,
        options.charge_min,
        options.charge_max,
        options.missed_cleavages,
        options.isotopes_per_peptide_min,
        options.isotopes_per_peptide_max,
        # section "algorithm"
        options.rt_threshold,
        options.rt_min,
        options.intensity_cutoff,
        options.intensity_correlation,
        options.model_deviation,
        options.allow_missing_peaks,
        # labels
        options.label_identifiers)

    #
    # 3. run
    #
    analyzer.run_all(experiment, consensus)

    #
    # 4. set dataprocessing and output meta information
    #
    consensus.sortByPosition()

    processing_steps = consensus.getDataProcessing()
    step = pyopenms.DataProcessing()
    step.setProcessingActions({
        pyopenms.ProcessingAction().DATA_PROCESSING,
        pyopenms.ProcessingAction().PEAK_PICKING,
        pyopenms.ProcessingAction().FILTERING,
        pyopenms.ProcessingAction().QUANTITATION,
    })
    step.setCompletionTime(timestamp)

    software = step.getSoftware()
    software.setName("SILACAnalyzer")
    # Test runs record a fixed version string and an extra meta value.
    software.setVersion("version_string" if options.test else "pyTOPP v1.10")
    step.setSoftware(software)
    if options.test:
        step.setMetaValue("parameter: mode", "test_mode")

    processing_steps.append(step)
    consensus.setDataProcessing(processing_steps)

    #
    # 5. write output
    #
    analyzer.writeConsensus(pyopenms.String(options.outfile), consensus)
def __main__():
    """Export the peptide hits of an idXML file as a table.

    Command-line options:
        -in    path of the input idXML (required)
        -out   path of the tab-separated file to write, or of an existing
               qcML file when -qcML is given (required)
        -doi   derive organ, replicate and patient from the ``_``-separated
               input file name (fields 3, 6 and 2)
        -qcML  attach the table to the qcML file given by -out instead of
               writing a tab-separated file

    One row is produced per peptide hit. Exits with status 1 when called
    without arguments or when the qcML file cannot be loaded.
    """
    parser = argparse.ArgumentParser()
    # Registered explicitly instead of ArgumentParser(version=...), which is
    # not accepted by Python 3's argparse.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-in', dest="inf", help='<Required> full path to the input idXML', required=True)
    parser.add_argument('-out', dest="out", help="<Required> full path to the csv to be written", required=True)
    parser.add_argument(
        '-doi', '--dept_of_immuno-filenames', dest='doi', action='store_true',
        help="If filename conforms with the filename scheme of the dept. of immunology... (in doubt, it probably doesn'T)")
    parser.set_defaults(doi=False)
    parser.add_argument(
        '-qcML', dest="qcml", action='store_true',
        help="If instead of writing a csv embedding in a qcml file - prerequisite: -out must be a existing qcml file")
    parser.set_defaults(qcml=False)

    # Show the full help when called without arguments. This must happen
    # before parse_args(), which exits by itself when the required -in/-out
    # options are missing (so no further presence check is needed afterwards).
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    if options.qcml:
        try:
            qf = oms.QcMLFile()
            qf.load(oms.String(options.out))
        except Exception:
            print("no usable qcml file given")
            sys.exit(1)

    # Search engine of the first protein identification; "NA" if there is none.
    SE = pros[0].getSearchEngine() if pros else "NA"

    # Meta value key under which each search engine stores its score.
    score_keys = {
        'MS-GF+': 'MS:1002053',
        'XTandem': 'E-Value',
        'Comet': 'MS:1002257',
        'Mascot': 'EValue',
    }
    sesmv = score_keys.get(SE, "NA")
    if sesmv == "NA":
        # Only a warning: processing continues without a search engine score.
        print("no adequate search engine score found")

    organ, repli, patient = None, None, None
    if options.doi:
        name_fields = path.basename(options.inf).split('_')
        # Fields 2, 3 and 6 are needed; a shorter name does not follow the scheme.
        if len(name_fields) > 6:
            organ = name_fields[3]
            repli = name_fields[6].split('#')[-1]
            patient = name_fields[2]
    if not organ or not repli or not patient:
        options.doi = False

    sepe = list()
    for pep in peps:
        pep.sort()
        hits = pep.getHits()
        st = pep.getScoreType()
        if st != 'Posterior Error Probability':
            print("Warning, no PEP as score type found!")

        if pep.metaValueExists('spectrum_reference'):
            sn_ref = pep.getMetaValue('spectrum_reference')
            sn = sn_ref.split('=')[-1]
        else:
            sn = '?'
            sn_ref = '?'

        for i, h in enumerate(hits):
            row = dict()
            row['file'] = path.basename(options.inf)
            row['sequence'] = h.getSequence().toUnmodifiedString()
            row['modified'] = h.getSequence().isModified()
            row['rank'] = i + 1
            row['spectrum'] = sn
            # match with spectrum_native_id from featureXML and
            # spectrum_reference from mapped PeptideIdentifications
            row['spectrum_reference'] = sn_ref

            if st == 'Posterior Error Probability':
                row['PEP'] = h.getScore()
            else:
                row['PEP'] = "NA"

            row['SE'] = SE
            if h.metaValueExists(sesmv):
                row['SEscore'] = h.getMetaValue(sesmv)

            if st == 'q-value':
                row['qvalue'] = h.getScore()
            elif h.metaValueExists('q-value_score'):
                row['qvalue'] = h.getMetaValue('q-value_score')

            if h.metaValueExists('binder'):
                if h.metaValueExists('weak'):
                    row['binder'] = 'weak'
                elif h.metaValueExists('strong'):
                    row['binder'] = 'strong'
                else:
                    row['binder'] = 'yes'
                # NOTE(review): allele is recorded for every binder class - confirm.
                row['allele'] = h.getMetaValue("binder")
            else:
                row['binder'] = 'no'

            if "decoy" not in h.getMetaValue("target_decoy"):
                row['target'] = 'target'
            else:
                row['target'] = 'decoy'

            if options.doi:
                row['organ'] = organ
                row['replicate'] = repli
                row['patient'] = patient

            sepe.append(row)

    outframe = pd.DataFrame(sepe)

    if options.qcml:
        idxa = oms.Attachment()
        idxa.name = "generic table"  # "extended id tab"
        idxa.cvRef = "qcML"
        idxa.cvAcc = "QC:0000049"
        idxa.qualityRef = "QC:0000025"
        idxa.colTypes = list(outframe.columns)
        # writes nan if cell not filled
        rows = [[str(row[col]) for col in idxa.colTypes]
                for ind, row in outframe.iterrows()]
        idxa.tableRows = rows

        ls = list()
        qf.getRunNames(ls)
        # The table is attached to the first run listed in the qcML file.
        qf.addRunAttachment(ls[0], idxa)
        qf.store(oms.String(options.out))
    else:
        outframe.to_csv(options.out, sep='\t', index=False)