def loadFileIdXML(self, file_path):
    """Load peptide identifications from an idXML file into ``self.scanIDDict``.

    For every peptide hit one entry is written, keyed by the retention time
    of its peptide identification rounded to three decimals. The entry holds
    the precursor m/z (``'m/z'``), the peptide sequence (``'PepSeq'``) and the
    annotated fragment ions (``'PepIons'``: ``{label: [m/z, charge]}``).
    Hits that share the same rounded retention time overwrite each other, so
    the last one wins. ``self.saveIdData()`` is called once at the end.

    Parameters
    ----------
    file_path : str
        Path of the idXML file to read.
    """
    prot_ids = []
    pep_ids = []
    pyopenms.IdXMLFile().load(file_path, prot_ids, pep_ids)

    # extract ID data from file
    for peptide_id in pep_ids:
        pep_mz = peptide_id.getMZ()
        rt_key = round(peptide_id.getRT(), 3)

        for hit in peptide_id.getHits():
            raw_seq = hit.getSequence().toString()
            # toString() may hand back bytes or str depending on the pyopenms
            # version; decode instead of slicing the "b'...'" repr, which
            # would cut off real residues whenever a str is returned.
            if isinstance(raw_seq, bytes):
                raw_seq = raw_seq.decode()
            pep_seq = str(raw_seq)
            # Only a *leading* "." (N-terminal modification marker) is
            # dropped; a "." further inside the string must not cost the
            # first residue.
            if pep_seq.startswith("."):
                pep_seq = pep_seq[1:]

            # One fresh ion table per hit.
            ions = {
                anno.annotation: [anno.mz, anno.charge]
                for anno in hit.getPeakAnnotations()
            }

            self.scanIDDict[rt_key] = {
                'm/z': pep_mz,
                'PepSeq': pep_seq,
                'PepIons': ions,
            }

    self.saveIdData()
def _read_psms_idxml(filename: str) -> pd.DataFrame:
    """
    Load the spectrum identifications stored in an idXML file.

    Only the first hit of every peptide identification is used. The peak
    file name is taken from the first ``spectra_data`` entry of the first
    protein identification run, and the scan number is whatever follows
    ``scan=`` in the ``spectrum_reference`` meta value.

    Parameters
    ----------
    filename : str
        The idXML file name.

    Returns
    -------
    pd.DataFrame
        One row per spectrum (the first occurrence is kept when a spectrum
        reference appears more than once) with the columns "sequence" and
        "score", indexed by the spectrum reference formatted as
        {filename}:scan:{scan}.
    """
    protein_runs, peptide_ids = [], []
    pyopenms.IdXMLFile().load(filename, protein_runs, peptide_ids)

    spectra_path = protein_runs[0].getMetaValue('spectra_data')[0].decode()
    peak_filename = os.path.splitext(os.path.basename(spectra_path))[0]

    scan_tag = 'scan='
    scan_numbers, peptide_sequences, hit_scores = [], [], []
    for identification in tqdm.tqdm(peptide_ids, desc='PSMs read',
                                    unit='PSMs'):
        reference = identification.getMetaValue('spectrum_reference').decode()
        scan_start = reference.find(scan_tag) + len(scan_tag)
        scan_numbers.append(int(reference[scan_start:]))

        first_hit = identification.getHits()[0]
        peptide_sequences.append(first_hit.getSequence().toString().decode())
        hit_scores.append(first_hit.getScore())

    table = pd.DataFrame({'filename': peak_filename,
                          'scan': scan_numbers,
                          'sequence': peptide_sequences,
                          'score': hit_scores})
    table['spectra_ref'] = (table['filename'] + ':scan:'
                            + table['scan'].astype(str))

    unique_psms = (table[['spectra_ref', 'sequence', 'score']]
                   .drop_duplicates('spectra_ref'))
    return unique_psms.set_index('spectra_ref')
def test_readfile_content(self):
    # The idXML file at self.filename is expected to contain exactly one
    # protein identification run and three peptide identifications.
    protein_ids, peptide_ids = [], []
    reader = pyopenms.IdXMLFile()
    reader.load(self.filename, protein_ids, peptide_ids)

    self.assertEqual(len(protein_ids), 1)
    self.assertEqual(len(peptide_ids), 3)
def test_readfile_content(self):
    # Load the idXML file at self.filename and check how many protein and
    # peptide identifications it yields (1 and 3 respectively).
    loaded_proteins = []
    loaded_peptides = []
    pyopenms.IdXMLFile().load(self.filename, loaded_proteins, loaded_peptides)

    assert len(loaded_proteins) == 1
    assert len(loaded_peptides) == 3
def id_mapper(in_file, id_file, out_file, params, use_centroid_rt,
              use_centroid_mz, use_subelements):
    """Annotate a quantitation file with identifications and store the result.

    The identifications are read from the idXML file ``id_file`` and mapped
    with an ``IDMapper`` configured by ``params`` onto the content of
    ``in_file``, whose type decides the handling:

    * consensusXML - the consensus map is annotated (``use_subelements``),
    * featureXML   - the feature map is annotated (``use_centroid_rt``,
      ``use_centroid_mz``),
    * mzQuantML    - every consensus map of the file is annotated
      (``use_subelements``).

    In each case an IDENTIFICATION_MAPPING data processing step is recorded
    through ``addDataProcessing`` and the annotated data is written to
    ``out_file``.

    Raises
    ------
    Exception
        If ``in_file`` is none of the three supported types.
    """
    in_type = pms.FileHandler.getType(in_file)

    protein_ids, peptide_ids = [], []
    pms.IdXMLFile().load(id_file, protein_ids, peptide_ids)

    mapper = pms.IDMapper()
    mapper.setParameters(params)

    if in_type == pms.Type.CONSENSUSXML:
        consensus_io = pms.ConsensusXMLFile()
        consensus_map = pms.ConsensusMap()
        consensus_io.load(in_file, consensus_map)
        mapper.annotate(consensus_map, peptide_ids, protein_ids,
                        use_subelements)
        addDataProcessing(
            consensus_map, params,
            pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        consensus_io.store(out_file, consensus_map)
        return

    if in_type == pms.Type.FEATUREXML:
        feature_io = pms.FeatureXMLFile()
        feature_map = pms.FeatureMap()
        feature_io.load(in_file, feature_map)
        mapper.annotate(feature_map, peptide_ids, protein_ids,
                        use_centroid_rt, use_centroid_mz)
        addDataProcessing(
            feature_map, params,
            pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        feature_io.store(out_file, feature_map)
        return

    if in_type == pms.Type.MZQ:
        quant_io = pms.MzQuantMLFile()
        quantifications = pms.MSQuantifications()
        quant_io.load(in_file, quantifications)
        consensus_maps = quantifications.getConsensusMaps()
        for single_map in consensus_maps:
            mapper.annotate(single_map, peptide_ids, protein_ids,
                            use_subelements)
            addDataProcessing(
                single_map, params,
                pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)
        # Hand the annotated maps back to the container before storing.
        quantifications.setConsensusMaps(consensus_maps)
        quant_io.store(out_file, quantifications)
        return

    raise Exception("invalid input file format")
def full(ctx, filename, mzid=None, idxml=None):
    """Calculate all possible metrics for these files. These data sources will be included in set metrics."""
    # NOTE: the docstring above is left untouched on purpose; click shows a
    # command's docstring as its help text.
    #
    # ctx      : click context, used to print the command help on bad input.
    # filename : mzML file to compute the basic quality metrics from.
    # mzid     : mzIdentML identification file (give either this or idxml).
    # idxml    : idXML identification file (give either this or mzid).

    # Exactly one identification file is required. Check this first so that
    # a wrong invocation does not have to wait for the mzML file to load.
    if idxml and mzid:
        logging.warning(
            "Sorry, you can only give one id file. Please choose one.")
        click.echo(ctx.get_help())
        return
    if not idxml and not mzid:
        logging.warning("Sorry, you must give one id file in this mode.")
        click.echo(ctx.get_help())
        return

    exp = oms.MSExperiment()
    oms.MzMLFile().load(click.format_filename(filename), exp)
    rq = basicqc.getBasicQuality(exp)

    # Pick up the MS2 spectra count from the basic metrics (the last
    # matching metric wins, 0 if there is none).
    ms2num = 0
    for x in rq.qualityMetrics:
        if x.name == "Number of MS2 spectra":
            ms2num = x.value

    if ms2num < 1:
        logging.warning(
            "We seem to have found no MS2 spectra which is unlikely to be "
            "true since you have also given some identifications. "
            "We continue with symbolic value of 1 for the number of MS2 "
            "spectra, however this means some metrics will invariably be "
            "incorrect! Please make sure, we have the right inputs.")
        ms2num = 1

    # Exactly one of mzid / idxml is set at this point.
    pros = list()
    peps = list()
    if idxml:
        oms_id = oms.IdXMLFile()
        idf = idxml
    else:
        oms_id = oms.MzIdentMLFile()
        idf = mzid
    oms_id.load(click.format_filename(idf), pros, peps)

    rq.qualityMetrics.extend(idqc.getIDQuality(exp, pros, peps, ms2num))
    rqs.append(rq)

    finale()
def parse_idxml(self):
    """Turn the peptide identifications of ``self.idxml_file`` into a table.

    Only the first hit of every peptide identification is used. Each row
    carries the run id (``self.base_name``), the scan number parsed from the
    ``spectrum_reference`` meta value, hit rank, charge, retention time,
    modified and unmodified sequence, the comma-joined protein accessions
    with their count, a decoy flag and the search engine / percolator scores
    read from the hit's meta values. Columns without a source in the idXML
    (``modifications``, ``gene_id``, ...) are filled with ``'-'`` and
    ``massdiff`` with ``0.0``.

    Returns
    -------
    pd.DataFrame
        One row per peptide identification.
    """
    peptides = []
    proteins = []
    parsed_peptides = []
    po.IdXMLFile().load(self.idxml_file, proteins, peptides)

    for p in peptides:
        # Fetch the first hit once instead of calling getHits() for every
        # single field.
        hit = p.getHits()[0]

        # search engine scores
        scores = {
            "var_MS:1002252_Comet:XCorr": float(hit.getMetaValue('MS:1002252')),
            "var_MS:1002253_Comet:DeltCn": float(hit.getMetaValue('MS:1002253')),
            # percolator probability
            "q_value": float(hit.getMetaValue('MS:1001491')),
            # NOTE(review): "pep" is read from the same meta value as
            # "q_value" - confirm whether a separate PEP term was intended.
            "pep": float(hit.getMetaValue('MS:1001491')),
        }

        sequence = hit.getSequence()
        accessions = [evidence.getProteinAccession()
                      for evidence in hit.getPeptideEvidences()]

        row = {
            'run_id': self.base_name,
            'scan_id': int(str(p.getMetaValue("spectrum_reference")).split('scan=')[-1].strip("'")),
            'hit_rank': int(hit.getRank()),
            'massdiff': float(0),
            'precursor_charge': int(hit.getCharge()),
            'retention_time': float(p.getRT()),
            'modified_peptide': sequence.toUniModString().decode("utf-8"),
            'peptide_sequence': sequence.toUnmodifiedString().decode("utf-8"),
            'modifications': '-',
            'nterm_modification': '-',
            'cterm_modification': '-',
            'protein_id': ','.join([accession.decode("utf-8") for accession in accessions]),
            'gene_id': '-',
            'num_tot_proteins': len(accessions),
            'decoy': hit.getMetaValue('target_decoy').decode("utf-8") == 'decoy',
        }
        # Score columns come last, after the descriptive columns.
        row.update(scores)
        parsed_peptides.append(row)

    df = pd.DataFrame(parsed_peptides)
    return df
def test_readfile(self):
    # Smoke test: loading the idXML file at self.filename must not raise.
    protein_ids, peptide_ids = [], []
    pyopenms.IdXMLFile().load(self.filename, protein_ids, peptide_ids)
# Convert the PSMs of an mzTab file (args.mztab) into an idXML file
# (output_path).

# Scan the mzTab header: remember the location of ms_run[1] and find the row
# that holds the PSM column header (PSH).
run_name = None
skiplines = None
with open(args.mztab) as f_in:
    for line_number, line in enumerate(f_in):
        if line.split('\t', 1)[0] == 'PSH':
            skiplines = line_number
            break
        if 'ms_run[1]-location' in line:
            # The value is the last field of the line; drop the line ending
            # so it does not end up inside the identifier.
            run_name = line.split('\t')[2].rstrip('\r\n')

if skiplines is None:
    raise ValueError('No PSH (PSM header) row found in ' + str(args.mztab))
if run_name is None:
    raise ValueError(
        'No ms_run[1]-location entry found in ' + str(args.mztab))

# NOTE(review): pandas does not count blank lines towards `header` when
# skip_blank_lines is left at its default - confirm the mzTab files have no
# blank lines above the PSH row.
psms = pd.read_csv(args.mztab, sep='\t', header=skiplines,
                   index_col='PSM_ID')

# One peptide identification with a single rank-1 hit per PSM row.
peptide_ids = []
for _, psm in psms.iterrows():
    peptide_id = pyms.PeptideIdentification()
    peptide_id.setRT(psm['retention_time'])
    peptide_id.setMZ(psm['exp_mass_to_charge'])
    peptide_id.setScoreType('q-value')
    peptide_id.setHigherScoreBetter(False)
    peptide_id.setIdentifier(run_name)

    peptide_hit = pyms.PeptideHit()
    peptide_hit.setScore(psm['search_engine_score[2]'])
    peptide_hit.setRank(1)
    peptide_hit.setCharge(psm['charge'])
    peptide_hit.setSequence(pyms.AASequence.fromString(psm['sequence']))

    peptide_id.setHits([peptide_hit])
    peptide_ids.append(peptide_id)

# The protein identification run only carries the identifier that ties the
# peptide identifications to it.
protein_id = pyms.ProteinIdentification()
protein_id.setIdentifier(run_name)

pyms.IdXMLFile().store(output_path, [protein_id], peptide_ids)
def __main__():
    """Export the peptide hits of an idXML file as a table.

    Command line options:
      -in    input idXML file (required)
      -out   output file (required): a tab separated table, or with -qcML an
             existing qcML file the table is attached to
      -doi   additionally fill organ / replicate / patient columns from the
             '_' separated fields of the input file name
      -qcML  attach the table to the qcML file given with -out

    One row is written per peptide hit. The process exits with status 1 on
    missing arguments or when the qcML file cannot be loaded.
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input idXML',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the csv to be written",
                        required=True)
    parser.add_argument(
        '-doi',
        '--dept_of_immuno-filenames',
        dest='doi',
        action='store_true',
        help=
        "If filename conforms with the filename scheme of the dept. of immunology... (in doubt, it probably doesn'T)"
    )
    parser.set_defaults(doi=False)
    parser.add_argument(
        '-qcML',
        dest="qcml",
        action='store_true',
        help=
        "If instead of writing a csv embedding in a qcml file - prerequisite: -out must be a existing qcml file"
    )
    parser.set_defaults(qcml=False)

    # Show the help when called without any argument. This has to happen
    # before parse_args(), which exits on its own when a required option is
    # missing.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    options = parser.parse_args()

    # Both paths are needed (an empty string does not count).
    if not (options.inf and options.out):
        parser.print_help()
        sys.exit(1)

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    if options.qcml:
        try:
            qf = oms.QcMLFile()
            qf.load(oms.String(options.out))
        except Exception:
            print("no usable qcml file given")
            sys.exit(1)

    # Search engine of the first identification run, "NA" if there is none.
    SE = pros[0].getSearchEngine() if pros else "NA"

    # Name of the hit meta value that carries the search engine score.
    score_meta_by_engine = {
        'MS-GF+': 'MS:1002053',
        'XTandem': 'E-Value',
        'Comet': 'MS:1002257',
        'Mascot': 'EValue',
    }
    sesmv = score_meta_by_engine.get(SE, "NA")
    if sesmv == "NA":
        # Not fatal: the SEscore column is simply left empty.
        print("no adequate search engine score found")

    organ, repli, patient = None, None, None
    if options.doi:
        name_fields = path.basename(options.inf).split('_')
        try:
            organ = name_fields[3]
            repli = name_fields[6].split('#')[-1]
            patient = name_fields[2]
        except IndexError:
            # File name has too few fields for the scheme; handled below.
            pass
    if not organ or not repli or not patient:
        options.doi = False

    file_name = path.basename(options.inf)

    sepe = list()
    for pep in peps:
        pep.sort()
        hits = pep.getHits()
        st = pep.getScoreType()
        if st != 'Posterior Error Probability':
            print("Warning, no PEP as score type found!")
        if pep.metaValueExists('spectrum_reference'):
            sn_ref = pep.getMetaValue('spectrum_reference')
            sn = sn_ref.split('=')[-1]
        else:
            sn = '?'
            sn_ref = '?'

        for i, h in enumerate(hits):
            sequence = h.getSequence()
            row = dict()
            row['file'] = file_name
            row['sequence'] = sequence.toUnmodifiedString()
            row['modified'] = sequence.isModified()
            row['rank'] = i + 1
            row['spectrum'] = sn
            # match with spectrum_native_id of featureXML and
            # spectrum_reference from mapped PeptideIdentifications
            row['spectrum_reference'] = sn_ref

            if st == 'Posterior Error Probability':
                row['PEP'] = h.getScore()
            else:
                row['PEP'] = "NA"

            row['SE'] = SE
            if h.metaValueExists(sesmv):
                row['SEscore'] = h.getMetaValue(sesmv)

            if st == 'q-value':
                row['qvalue'] = h.getScore()
            elif h.metaValueExists('q-value_score'):
                row['qvalue'] = h.getMetaValue('q-value_score')

            if h.metaValueExists('binder'):
                if h.metaValueExists('weak'):
                    row['binder'] = 'weak'
                elif h.metaValueExists('strong'):
                    row['binder'] = 'strong'
                else:
                    row['binder'] = 'yes'
                row['allele'] = h.getMetaValue("binder")
            else:
                row['binder'] = 'no'

            if "decoy" not in h.getMetaValue("target_decoy"):
                row['target'] = 'target'
            else:
                row['target'] = 'decoy'

            if options.doi:
                row['organ'] = organ
                row['replicate'] = repli
                row['patient'] = patient

            sepe.append(row)

    outframe = pd.DataFrame(sepe)

    if options.qcml:
        idxa = oms.Attachment()
        idxa.name = "generic table"  # "extended id tab"
        idxa.cvRef = "qcML"
        idxa.cvAcc = "QC:0000049"
        idxa.qualityRef = "QC:0000025"
        idxa.colTypes = list(outframe.columns)
        # writes nan if cell not filled
        rows = [[str(row[f]) for f in idxa.colTypes]
                for ind, row in outframe.iterrows()]
        idxa.tableRows = rows
        ls = list()
        qf.getRunNames(ls)
        # The table is attached to the first run of the qcML file.
        qf.addRunAttachment(ls[0], idxa)
        qf.store(oms.String(options.out))
    else:
        outframe.to_csv(options.out, sep='\t', index=False)
def __main__():
    """Annotate the peptide hits of an idXML file with predicted MHC binding.

    Command line options:
      -c          MHC class, "I" or "II" (required)
      -in         input idXML file (required)
      -out        output idXML file (required)
      -allele     allele file with one allele name per line (required); the
                  value "in" together with -dirallele derives the allele file
                  from the input file name
      -dirallele  directory holding the allele files for "-allele in"

    The unmodified sequences of all hits whose length fits the class
    (8-12 for class I, 15-25 for class II) and that contain none of U, B, X,
    Z are passed to the predictor. For every sequence that ``categorize``
    assigns a category to, the matching hits get the best scoring allele as
    meta value 'binder' and the score under the category name. All
    identifications are then written to the output file.
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-c',
                        dest="mhcclass",
                        help='<Required> MHC class',
                        required=True)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input file',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the output file",
                        required=True)
    parser.add_argument(
        '-allele',
        dest="allele",
        help=
        "<Required> full path to an allele file, if 'in', allele file will be deduced from in file name",
        required=True)
    parser.add_argument(
        '-dirallele',
        dest="dirallele",
        help=
        "for use with '-allele in', describes full base path to the allele files"
    )

    # Show the help when called without any argument. This has to happen
    # before parse_args(), which exits on its own when a required option is
    # missing.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    options = parser.parse_args()

    # All three values are needed (an empty string does not count).
    if not (options.inf and options.out and options.allele):
        parser.print_help()
        sys.exit(1)

    # Fred2.FileReader.read_lines is broken, so the allele file is read by
    # hand.
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print("No class 1 type run detected.")
            sys.exit(0)
        # The allele file is named after the part behind the "-" of the
        # (last) "BD..." field of the input file name.
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        if af is None:
            print("No allele file could be deduced from " + options.inf)
            sys.exit(1)
    else:
        af = options.allele

    target_alleles_set = set()
    with open(af, 'r') as handle:
        for line in handle:
            allele_name = line.strip().upper()
            # Skip blank lines (e.g. a trailing newline at the end of file).
            if allele_name:
                target_alleles_set.add(Allele(allele_name))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    if options.mhcclass == "I":
        ttn = EpitopePredictorFactory('netmhcpan', version='3.0')
        lowerBound = 8
        upperBound = 12
    elif options.mhcclass == "II":
        ttn = EpitopePredictorFactory('netmhcIIpan', version='3.1')
        lowerBound = 15
        upperBound = 25
    else:
        # Without this the predictor and length bounds would be undefined.
        print("Unsupported MHC class: " + str(options.mhcclass))
        parser.print_help()
        sys.exit(1)

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    # Collect the distinct sequences to predict. Decoy hits are not filtered
    # out here.
    excluded_residues = ('U', 'B', 'X', 'Z')
    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            unmod = h.getSequence().toUnmodifiedString()
            if lowerBound <= len(unmod) <= upperBound \
                    and not any(aa in unmod for aa in excluded_residues):
                pepstr.add(unmod)

    es = [Peptide(x) for x in pepstr]
    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print("something went wrong with the netMHC prediction %s what: %s"
              % (options.inf, str(e)))
        sys.exit(1)

    # only max: keep the best scoring allele per peptide
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max()  # bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    # Write the prediction onto every hit with a categorised sequence; all
    # other hits are kept unchanged.
    for pep in peps:
        hits = pep.getHits()
        for h in hits:
            unmod = h.getSequence().toUnmodifiedString()
            if unmod in preds:
                best_allele, categ, score = preds[unmod]
                h.setMetaValue('binder', best_allele)
                h.setMetaValue(str(categ), score)
        pep.setHits(hits)

    f.store(options.out, pros, peps)