Exemple #1
0
    def test_encodeNP_SLOF(self):
        """Encode ``self.testData`` with SLOF numpress and check the text output.

        Python counterpart of the C++ test:

          String out;

          MSNumpressCoder::NumpressConfig config;
          config.np_compression = MSNumpressCoder::SLOF;
          config.estimate_fixed_point = true; // critical

          bool zlib_compression = false;
          MSNumpressCoder().encodeNP(in, out, zlib_compression, config);

          TEST_EQUAL(out.size(), 24)
          TEST_EQUAL(out, "QMVagAAAAAAZxX3ivPP8/w==")
        """
        expected = "QMVagAAAAAAZxX3ivPP8/w=="

        numpress = pyopenms.MSNumpressCoder()
        slof_config = pyopenms.NumpressConfig()
        slof_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.SLOF)
        # Flagged as critical in the C++ reference test.
        slof_config.estimate_fixed_point = True

        # encodeNP writes its result into the String passed as second argument;
        # the third argument is the zlib_compression flag (off here).
        encoded = pyopenms.String()
        numpress.encodeNP(self.testData, encoded, False, slof_config)
        out = encoded.c_str()

        self.assertEqual(len(out), 24)
        self.assertEqual(out, expected)
Exemple #2
0
def psm_df_mgf(input_map, theoretical, max_delta_ppm, scan_id,
               modified_peptide, precursor_charge):
    """Annotate the peaks of one scan with the closest theoretical ions.

    ``theoretical[modified_peptide][precursor_charge]`` must unpack into
    ``(ion labels, ion masses)`` and ``input_map[scan_id]`` must be a 2-D
    array whose column 0 is used as m/z and column 1 as intensity.

    A peak is kept when at least one theoretical ion lies within
    ``min(max_delta_ppm, 30)`` ppm of it; it is then labelled with the ion
    that has the smallest ppm error.  Kept intensities are rescaled so that
    the largest one equals 10000 (skipped when nothing positive was kept).

    Returns a list:
    ``[n_annotated, fragments, product_mzs, intensities, scan_id,
    precursor_mz, modified_peptide, precursor_charge]``.
    """
    series = theoretical[modified_peptide][precursor_charge]
    peak_table = input_map[scan_id]

    # Hard upper bound on the matching tolerance, in ppm.
    ppm_cap = 30
    labels, theoretical_mzs = series

    observed_mzs = peak_table[:, 0]
    observed_intensities = peak_table[:, 1]

    # Rows: observed peaks, columns: theoretical ions.
    ppm_errors = np.abs(
        (observed_mzs[:, np.newaxis] - theoretical_mzs) / theoretical_mzs * 1e6)
    tolerance = min(max_delta_ppm, ppm_cap)
    has_match = (ppm_errors < tolerance).any(1)
    best_ion = ppm_errors[has_match].argmin(1)

    fragments = labels[best_ion]
    product_mzs = theoretical_mzs[best_ion]
    intensities = observed_intensities[has_match]

    # Baseline normalization to highest annotated peak
    tallest = np.amax(intensities, initial=0.0)
    if tallest > 0:
        intensities /= tallest
        intensities *= 10000

    n_annotated = len(fragments)
    peptide = po.AASequence.fromString(po.String(modified_peptide))
    precursor_mz = peptide.getMonoWeight(
        po.Residue.ResidueType.Full, precursor_charge) / precursor_charge
    return [
        n_annotated, fragments, product_mzs, intensities, scan_id,
        precursor_mz, modified_peptide, precursor_charge
    ]
Exemple #3
0
def testParamEntry():
    # ParamEntry::isValid takes a "String &" as input argument, which cannot
    # be implemented by a Python string, so no automatic conversion from a
    # basestring should happen here: an explicit pyopenms.String is passed.
    entry = pyopenms.ParamEntry()
    msg = pyopenms.String()
    is_valid = entry.isValid(msg)
    assert is_valid
    # The message passed in is expected to remain empty.
    assert msg.c_str() == b""
Exemple #4
0
def read_tims_mgf(tims_mgf_path, psms, theoretical, max_delta_ppm):
	"""Read an MGF file and build annotated transitions for the given PSMs.

	Every ``BEGIN IONS`` ... ``END IONS`` record must contain exactly one
	``TITLE=Cmpd <number>,`` entry; that number is the key under which the
	record's peak list (m/z, intensity pairs) is stored.

	Parameters:
		tims_mgf_path: path of the MGF file to read.
		psms: pandas DataFrame whose rows unpack, in column order, into
			(scan_id, modified_peptide, precursor_charge).
		theoretical: nested mapping, indexed as
			``theoretical[modified_peptide][precursor_charge]``, giving the ion
			series handed to ``annotate_mass``.
		max_delta_ppm: tolerance forwarded to ``annotate_mass``.

	Returns:
		A DataFrame with the columns scan_id, modified_peptide,
		precursor_charge, precursor_mz, fragment, product_mz and intensity.
		Intensities are scaled per PSM so that the highest annotated peak is
		10000; identically annotated peaks are collapsed to the most intense.

	Raises:
		RuntimeError: if a record does not contain exactly one Cmpd number.
		KeyError: if a PSM refers to a scan or ion series that is missing.
	"""
	import mmap

	# Raw bytes literals so that "\s" and "\d" reach the regex engine unchanged
	# instead of being treated as (invalid) string escapes.
	record_pattern = re.compile(br'BEGIN IONS\r?\n(.*?)\nEND IONS', re.MULTILINE | re.DOTALL)
	scan_num_pattern = re.compile(br'TITLE=Cmpd\s+([0-9]+),')
	peaks_pattern = re.compile(br'^([\d.]+)\s+([\d.]+)', re.MULTILINE)

	tims_data = {}
	with open(tims_mgf_path, "rb") as f:
		# The mapping is closed on exit; everything kept from it is a copy.
		with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
			for record in record_pattern.finditer(mm):
				rec = record.group(1)
				scan_num_findall = scan_num_pattern.findall(rec)
				if len(scan_num_findall) != 1:
					# rec is bytes: decode it, str + bytes would raise TypeError.
					raise RuntimeError("Cannot find Cmpd number from " + rec.decode("utf-8", errors="replace"))
				scan_num = int(scan_num_findall[0])
				tims_data[scan_num] = np.array(peaks_pattern.findall(rec), dtype=float)

	peaks_list = []
	for scan_id, modified_peptide, precursor_charge in psms.itertuples(index=False):
		ionseries = theoretical[modified_peptide][precursor_charge]

		mz_intensity_array = tims_data[scan_id]

		fragments = []
		product_mzs = []
		intensities = []
		for mz, intensity in mz_intensity_array:
			fragment, product_mz = annotate_mass(mz, ionseries, max_delta_ppm)
			if fragment is not None:
				fragments.append(fragment)
				product_mzs.append(product_mz)
				intensities.append(intensity)

		peaks = pd.DataFrame({'fragment': fragments, 'product_mz': product_mzs, 'intensity': intensities})
		peaks['scan_id'] = scan_id
		peaks['precursor_mz'] = po.AASequence.fromString(po.String(modified_peptide)).getMonoWeight(po.Residue.ResidueType.Full, precursor_charge) / precursor_charge
		peaks['modified_peptide'] = modified_peptide
		peaks['precursor_charge'] = precursor_charge

		# Baseline normalization to highest annotated peak; skipped when there
		# is no positive intensity, which would otherwise divide by zero.
		max_intensity = np.max(peaks['intensity'])
		if max_intensity > 0:
			peaks['intensity'] = peaks['intensity'] * (10000 / max_intensity)

		peaks_list.append(peaks)

	if len(peaks_list) > 0:
		transitions = pd.concat(peaks_list)
		# Multiple peaks might be identically annotated, only use most intense
		transitions = transitions.groupby(['scan_id','modified_peptide','precursor_charge','precursor_mz','fragment','product_mz'])['intensity'].max().reset_index()
	else:
		transitions = pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []})
	return transitions
Exemple #5
0
    def test_encodeNP_LINEAR(self):
        """Encode ``self.testData`` with LINEAR numpress and check the text output."""
        expected = "QWR64UAAAADo//8/0P//f1kSgA=="

        numpress = pyopenms.MSNumpressCoder()
        linear_config = pyopenms.NumpressConfig()
        linear_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.LINEAR)
        linear_config.estimate_fixed_point = True

        # encodeNP writes its result into the String passed as second argument.
        encoded = pyopenms.String()
        numpress.encodeNP(self.testData, encoded, False, linear_config)
        out = encoded.c_str()

        self.assertEqual(len(out), 28)
        self.assertEqual(out, expected)
Exemple #6
0
    def test_encodeNP_PIC(self):
        """Encode ``self.testData`` with PIC numpress and check the text output."""
        expected = "ZGaMXCFQkQ=="

        numpress = pyopenms.MSNumpressCoder()
        pic_config = pyopenms.NumpressConfig()
        pic_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.PIC)
        pic_config.estimate_fixed_point = True

        # encodeNP writes its result into the String passed as second argument.
        encoded = pyopenms.String()
        numpress.encodeNP(self.testData, encoded, False, pic_config)
        out = encoded.c_str()

        self.assertEqual(len(out), 12)
        self.assertEqual(out, expected)
Exemple #7
0
    def test_encodeNP_PIC(self):
        """Encode ``self.testData`` with PIC numpress via encodeNPRaw and check the bytes."""
        expected = b'df\x8c\\!P\x91'

        numpress = pyopenms.MSNumpressCoder()
        pic_config = pyopenms.NumpressConfig()
        pic_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.PIC)
        pic_config.estimate_fixed_point = True

        # encodeNPRaw writes its result into the String passed as second argument.
        raw = pyopenms.String()
        numpress.encodeNPRaw(self.testData, raw, pic_config)
        out = raw.c_str()

        self.assertEqual(len(out), 7)
        self.assertEqual(out, expected)
Exemple #8
0
    def test_encodeNP_LINEAR(self):
        """Encode ``self.testData`` with LINEAR numpress via encodeNPRaw and check the bytes."""
        expected = (
            b'Adz\xe1@\x00\x00\x00\xe8\xff\xff?\xd0\xff\xff\x7fY\x12\x80')

        numpress = pyopenms.MSNumpressCoder()
        linear_config = pyopenms.NumpressConfig()
        linear_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.LINEAR)
        linear_config.estimate_fixed_point = True

        # encodeNPRaw writes its result into the String passed as second argument.
        raw = pyopenms.String()
        numpress.encodeNPRaw(self.testData, raw, linear_config)
        out = raw.c_str()

        self.assertEqual(len(out), 19)
        self.assertEqual(out, expected)
Exemple #9
0
    def test_encodeNP_SLOF(self):
        """Encode ``self.testData`` with SLOF numpress via encodeNPRaw and check the bytes."""
        expected = (
            b'@\xc5Z\x80\x00\x00\x00\x00\x19\xc5}\xe2\xbc\xf3\xfc\xff')

        numpress = pyopenms.MSNumpressCoder()
        slof_config = pyopenms.NumpressConfig()
        slof_config.np_compression = (
            pyopenms.MSNumpressCoder.NumpressCompression.SLOF)
        slof_config.estimate_fixed_point = True

        # encodeNPRaw writes its result into the String passed as second argument.
        raw = pyopenms.String()
        numpress.encodeNPRaw(self.testData, raw, slof_config)
        out = raw.c_str()

        self.assertEqual(len(out), 16)
        self.assertEqual(out, expected)
Exemple #10
0
def generate_ionseries(peptide_sequence, precursor_charge, fragment_charges=[1,2,3,4], fragment_types=['b','y'], enable_specific_losses = False, enable_unspecific_losses = False):
	"""Compute the theoretical fragment ion series of a peptide.

	For every requested fragment type, every fragment charge not exceeding
	``precursor_charge`` and every ordinal from 1 to len(sequence) - 1, the
	mono-isotopic weight of the prefix (a/b/c) or suffix (x/y/z) ion divided
	by its charge is recorded under the label ``<type><ordinal>^<charge>``.
	When loss generation is enabled, neutral-loss variants are added under
	``<type><ordinal>-<loss formula>^<charge>``.

	Parameters:
		peptide_sequence: peptide string parsed with ``po.AASequence.fromString``.
		precursor_charge: fragment charges above this value are skipped.
		fragment_charges: iterable of fragment charges to consider. The default
			list is only read, never mutated.
		fragment_types: iterable of 'a', 'b', 'c', 'x', 'y', 'z'. The default
			list is only read, never mutated.
		enable_specific_losses: include losses whose formula is not one of the
			unspecific ones (H2O1, H3N1, C1H2N2, C1H2N1O1).
		enable_unspecific_losses: include losses whose formula is one of those.

	Returns:
		Tuple ``(labels, values)``: a list of labels and a float numpy array
		with the matching values, in insertion order.

	Raises:
		ValueError: if ``fragment_types`` contains an unsupported type.
	"""
	peptide = po.AASequence.fromString(po.String(peptide_sequence))
	sequence = peptide.toUnmodifiedString()

	unspecific_losses = ["H2O1","H3N1","C1H2N2","C1H2N1O1"]

	residue_types = po.Residue.ResidueType
	# fragment type -> (sub-sequence extractor, residue type used for the weight)
	ion_kinds = {
		'a': (peptide.getPrefix, residue_types.AIon),
		'b': (peptide.getPrefix, residue_types.BIon),
		'c': (peptide.getPrefix, residue_types.CIon),
		'x': (peptide.getSuffix, residue_types.XIon),
		'y': (peptide.getSuffix, residue_types.YIon),
		'z': (peptide.getSuffix, residue_types.ZIon),
	}

	fragments = {}

	for fragment_type in fragment_types:
		# Fail loudly on an unknown type; otherwise the ion and mass would be
		# undefined or silently reused from the previously processed type.
		if fragment_type not in ion_kinds:
			raise ValueError("Unsupported fragment type: %r" % (fragment_type,))
		get_ion, residue_type = ion_kinds[fragment_type]

		for fragment_charge in fragment_charges:
			if fragment_charge > precursor_charge:
				continue

			for fragment_ordinal in range(1,len(sequence)):
				ion = get_ion(fragment_ordinal)
				mass = ion.getMonoWeight(residue_type, fragment_charge) / fragment_charge

				# Standard fragment ions
				fragments[fragment_type + str(fragment_ordinal) + "^" + str(fragment_charge)] = mass

				# Losses
				if enable_specific_losses or enable_unspecific_losses:
					# NOTE(review): the residue at index 0 is never inspected
					# for neutral losses - confirm that this is intended.
					for lossfragment_ordinal in range(1,ion.size()):
						residue = ion.getResidue(lossfragment_ordinal)
						if not residue.hasNeutralLoss():
							continue
						for loss in residue.getLossFormulas():
							loss_type = loss.toString().decode("utf-8")
							is_unspecific = loss_type in unspecific_losses

							if (enable_specific_losses and not is_unspecific) or (enable_unspecific_losses and is_unspecific):
								fragments[fragment_type + str(fragment_ordinal) + "-" + loss_type + "^" + str(fragment_charge)] = mass - (loss.getMonoWeight() / fragment_charge)

	# The builtin float replaces the np.float alias, which was removed from NumPy.
	return list(fragments.keys()), np.fromiter(fragments.values(), float, len(fragments))
Exemple #11
0
def read_mzml_or_mzxml_impl(path, psms, theoretical, max_delta_ppm, filetype):
	"""Load an mzML or mzXML file and build annotated transitions for the PSMs.

	``filetype`` must be 'mzml' or 'mzxml' and selects the pyopenms reader.
	Each row of ``psms`` needs the columns scan_id, modified_peptide and
	precursor_charge; the spectrum at index ``scan_id - 1`` is annotated peak
	by peak with ``annotate_mass`` against
	``theoretical[modified_peptide][precursor_charge]``.

	Returns a DataFrame with the columns scan_id, modified_peptide,
	precursor_charge, precursor_mz, fragment, product_mz and intensity, where
	intensities are scaled per PSM so the highest annotated peak is 10000 and
	identically annotated peaks are collapsed to the most intense one.
	"""
	assert filetype in ('mzml', 'mzxml')
	if filetype == 'mzml':
		fh = po.MzMLFile()
	else:
		fh = po.MzXMLFile()
	fh.setLogType(po.LogType.CMD)
	input_map = po.MSExperiment()
	fh.load(path, input_map)

	key_columns = ['scan_id','modified_peptide','precursor_charge','precursor_mz','fragment','product_mz']

	peaks_list = []
	for _, psm in psms.iterrows():
		scan_id = psm['scan_id']
		modified_peptide = psm['modified_peptide']
		precursor_charge = psm['precursor_charge']
		ionseries = theoretical[modified_peptide][precursor_charge]

		# The spectrum index is the scan id shifted down by one.
		spectrum = input_map.getSpectrum(scan_id - 1)

		fragments = []
		product_mzs = []
		intensities = []
		for peak in spectrum:
			fragment, product_mz = annotate_mass(peak.getMZ(), ionseries, max_delta_ppm)
			if fragment is None:
				continue
			fragments.append(fragment)
			product_mzs.append(product_mz)
			intensities.append(peak.getIntensity())

		peaks = pd.DataFrame({'fragment': fragments, 'product_mz': product_mzs, 'intensity': intensities})
		peaks['scan_id'] = scan_id
		peptide = po.AASequence.fromString(po.String(modified_peptide))
		peaks['precursor_mz'] = peptide.getMonoWeight(po.Residue.ResidueType.Full, precursor_charge) / precursor_charge
		peaks['modified_peptide'] = modified_peptide
		peaks['precursor_charge'] = precursor_charge

		# Baseline normalization to highest annotated peak
		tallest = np.max(peaks['intensity'])
		if tallest > 0:
			peaks['intensity'] = peaks['intensity'] * (10000 / tallest)

		peaks_list.append(peaks)

	if len(peaks_list) == 0:
		return pd.DataFrame({column: [] for column in ['scan_id', 'modified_peptide', 'precursor_charge', 'precursor_mz', 'fragment', 'product_mz', 'intensity']})

	# Multiple peaks might be identically annotated, only use most intense
	combined = pd.concat(peaks_list)
	return combined.groupby(key_columns)['intensity'].max().reset_index()
Exemple #12
0
def psm_df(input_map, theoretical, max_delta_ppm, scan_id, modified_peptide,
           precursor_charge):
    """Annotate one spectrum of ``input_map`` for a single PSM.

    The spectrum at index ``scan_id - 1`` is annotated by
    ``annotate_mass_spectrum`` against
    ``theoretical[modified_peptide][precursor_charge]``.  The returned
    intensities are rescaled in place so that the largest equals 10000
    (skipped when no positive intensity is present).

    Returns a list:
    ``[n_annotated, fragments, product_mzs, intensities, scan_id,
    precursor_mz, modified_peptide, precursor_charge]``.
    """
    series = theoretical[modified_peptide][precursor_charge]
    spectrum = input_map.getSpectrum(scan_id - 1)

    annotation = annotate_mass_spectrum(series, max_delta_ppm, spectrum)
    fragments, product_mzs, intensities = annotation

    # Baseline normalization to highest annotated peak
    tallest = np.amax(intensities, initial=0.0)
    if tallest > 0:
        intensities /= tallest
        intensities *= 10000

    n_annotated = len(fragments)
    peptide = po.AASequence.fromString(po.String(modified_peptide))
    precursor_mz = peptide.getMonoWeight(
        po.Residue.ResidueType.Full, precursor_charge) / precursor_charge
    return [
        n_annotated, fragments, product_mzs, intensities, scan_id,
        precursor_mz, modified_peptide, precursor_charge
    ]
Exemple #13
0
def main(options):
    """Run SILACAnalyzer on ``options.infile`` and write ``options.outfile``.

    Steps: load the experiment, keep only MS level 1 spectra, initialize and
    run the analyzer, attach a DataProcessing entry to the consensus map and
    write the result.  With ``options.test`` set, a fixed date is used both
    as completion time and as seed of the unique id generator, and a fixed
    version string is recorded.
    """
    # make sure that the ids are "correct" for the testcase
    date_time = pyopenms.DateTime()
    if not options.test:
        date_time = pyopenms.DateTime.now()
    else:
        date_time.set("1999-12-31 23:59:59")
        pyopenms.UniqueIdGenerator().setSeed(date_time)

    exp = pyopenms.MSExperiment()
    out_map = pyopenms.ConsensusMap()
    pyopenms.FileHandler().loadExperiment(options.infile, exp)
    exp.updateRanges()

    #
    # 1. filter MS1 level (only keep MS1)
    #
    ms1_only = copy.copy(exp)
    ms1_only.clear(False)
    for spectrum in exp:
        if spectrum.getMSLevel() != 1:
            continue
        ms1_only.push_back(spectrum)
    exp = ms1_only
    exp.sortSpectra(True)

    #
    # 2. set parameters
    #
    analyzer = pyopenms.SILACAnalyzer()
    initialize_args = [
        # section sample
        options.selected_labels,
        options.charge_min,
        options.charge_max,
        options.missed_cleavages,
        options.isotopes_per_peptide_min,
        options.isotopes_per_peptide_max,
        # section "algorithm"
        options.rt_threshold,
        options.rt_min,
        options.intensity_cutoff,
        options.intensity_correlation,
        options.model_deviation,
        options.allow_missing_peaks,
        # labels
        options.label_identifiers,
    ]
    analyzer.initialize(*initialize_args)

    #
    # 3. run
    #
    analyzer.run_all(exp, out_map)

    #
    # 4. set dataprocessing and output meta information
    #
    out_map.sortByPosition()

    processing_history = out_map.getDataProcessing()
    step = pyopenms.DataProcessing()
    actions = {
        pyopenms.ProcessingAction().DATA_PROCESSING,
        pyopenms.ProcessingAction().PEAK_PICKING,
        pyopenms.ProcessingAction().FILTERING,
        pyopenms.ProcessingAction().QUANTITATION,
    }
    step.setProcessingActions(actions)
    step.setCompletionTime(date_time)

    software = step.getSoftware()
    software.setName("SILACAnalyzer")
    software.setVersion("version_string" if options.test else "pyTOPP v1.10")
    step.setSoftware(software)
    if options.test:
        step.setMetaValue("parameter: mode", "test_mode")
    processing_history.append(step)
    out_map.setDataProcessing(processing_history)

    #
    # 5. write output
    #
    analyzer.writeConsensus(pyopenms.String(options.outfile), out_map)
def __main__():
    """Export the peptide hits of an idXML file as a table.

    Command line options:
      -in    path of the idXML file to read (required).
      -out   path of the tab-separated file to write, or, together with
             -qcML, of an existing qcML file to attach the table to (required).
      -doi   derive organ, replicate and patient from the '_'-separated input
             file name (fields 3, 6 and 2); silently disabled when any of them
             is empty, disabled with a warning when the name has too few fields.
      -qcML  embed the table into the qcML file given by -out instead of
             writing a csv.

    One row is produced per peptide hit (file, sequence, modification flag,
    rank, spectrum reference, PEP, search engine and its score, q-value,
    binder information, target/decoy flag and, with -doi, the sample fields).

    Exits with status 1 when called without arguments, when -in/-out are
    empty, or when the qcML file cannot be loaded or contains no run.
    """
    parser = argparse.ArgumentParser()
    # Python 3's argparse no longer accepts ArgumentParser(version=...); the
    # 'version' action provides the same -v/--version switch.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=VERSION)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input idXML',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the csv to be written",
                        required=True)
    parser.add_argument(
        '-doi',
        '--dept_of_immuno-filenames',
        dest='doi',
        action='store_true',
        help=
        "If filename conforms with the filename scheme of the dept. of immunology... (in doubt, it probably doesn'T)"
    )
    parser.set_defaults(doi=False)
    parser.add_argument(
        '-qcML',
        dest="qcml",
        action='store_true',
        help=
        "If instead of writing a csv embedding in a qcml file - prerequisite: -out must be a existing qcml file"
    )
    parser.set_defaults(qcml=False)

    # Must run before parse_args(): with required options missing,
    # parse_args() exits on its own and the help would never be shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    options = parser.parse_args()

    # Both paths are needed; this also rejects empty strings.
    if not (options.inf and options.out):
        parser.print_help()
        sys.exit(1)

    pros = list()
    peps = list()
    idxml = oms.IdXMLFile()
    idxml.load(options.inf, pros, peps)

    qf = None
    if options.qcml:
        try:
            qf = oms.QcMLFile()
            qf.load(oms.String(options.out))
        except Exception:
            # Exception (not a bare except) so that SystemExit and
            # KeyboardInterrupt are not swallowed.
            print("no usable qcml file given")
            sys.exit(1)

    # Search engine of the first protein identification run, "NA" if none.
    SE = pros[0].getSearchEngine() if pros else "NA"
    # Meta value key under which each search engine stores its score.
    score_keys = {
        'MS-GF+': 'MS:1002053',
        'XTandem': 'E-Value',
        'Comet': 'MS:1002257',
        'Mascot': 'EValue',
    }
    sesmv = score_keys.get(SE, "NA")
    if sesmv == "NA":
        print("no adequate search engine score found")
        #sys.exit(1)

    in_basename = path.basename(options.inf)

    organ, repli, patient = None, None, None
    if options.doi:
        name_fields = in_basename.split('_')
        try:
            organ = name_fields[3]
            repli = name_fields[6].split('#')[-1]
            patient = name_fields[2]
        except IndexError:
            # File name does not follow the expected scheme.
            print("Warning, filename does not conform with the -doi scheme!")
            organ, repli, patient = None, None, None
    if not organ or not repli or not patient:
        options.doi = False

    sepe = list()
    for pep in peps:
        pep.sort()
        hits = pep.getHits()
        st = pep.getScoreType()
        score_is_pep = st == 'Posterior Error Probability'
        if not score_is_pep:
            print("Warning, no PEP as score type found!")
        if pep.metaValueExists('spectrum_reference'):
            sn_ref = pep.getMetaValue('spectrum_reference')
            sn = sn_ref.split('=')[-1]
        else:
            sn = '?'
            sn_ref = '?'
        for i, h in enumerate(hits):
            row = dict()
            row['file'] = in_basename
            row['sequence'] = h.getSequence().toUnmodifiedString()
            row['modified'] = h.getSequence().isModified()
            row['rank'] = i + 1
            row['spectrum'] = sn
            row['spectrum_reference'] = sn_ref  # match with spectrum_native_id from featureXML and spectrum_reference from mapped PeptideIdentifications
            row['PEP'] = h.getScore() if score_is_pep else "NA"
            row['SE'] = SE
            if h.metaValueExists(sesmv):
                row['SEscore'] = h.getMetaValue(sesmv)
            if st == 'q-value':
                row['qvalue'] = h.getScore()
            elif h.metaValueExists('q-value_score'):
                row['qvalue'] = h.getMetaValue('q-value_score')
            if h.metaValueExists('binder'):
                if h.metaValueExists('weak'):
                    row['binder'] = 'weak'
                elif h.metaValueExists('strong'):
                    row['binder'] = 'strong'
                else:
                    row['binder'] = 'yes'
                row['allele'] = h.getMetaValue("binder")
            else:
                row['binder'] = 'no'
            if "decoy" not in h.getMetaValue("target_decoy"):
                row['target'] = 'target'
            else:
                row['target'] = 'decoy'
            if options.doi:
                row['organ'] = organ
                row['replicate'] = repli
                row['patient'] = patient
            sepe.append(row)

    # Keys that are missing in some rows become NaN cells.
    outframe = pd.DataFrame(sepe)

    if options.qcml:
        idxa = oms.Attachment()
        idxa.name = "generic table"  #"extended id tab"
        idxa.cvRef = "qcML"
        idxa.cvAcc = "QC:0000049"
        idxa.qualityRef = "QC:0000025"

        columns = list(outframe.columns)
        idxa.colTypes = columns
        # writes nan if cell not filled
        idxa.tableRows = [[str(row[column]) for column in columns]
                          for _, row in outframe.iterrows()]

        ls = list()
        qf.getRunNames(ls)
        if not ls:
            print("no run found in qcml file")
            sys.exit(1)
        qf.addRunAttachment(ls[0], idxa)
        #qf.store(oms.String(options.out.replace('%', 'perc')))
        qf.store(oms.String(options.out))

    else:
        outframe.to_csv(options.out, sep='\t', index=False)