def get_PSM_pepxml(psm_file):
    '''
    Read all spectrum queries from a pepXML file and normalise their search hits.

    For every search hit of every record:

    * ``massdiff`` is replaced by the string
      ``"<massdiff>;<calc_neutral_pep_mass>;<precursor_neutral_mass>"``
      (the precursor mass is taken from the enclosing record);
    * ``search_score`` is replaced by a dict with the keys ``score`` and
      ``evalue``, computed by ``_get_score_`` and ``_get_evalue_`` from the
      original ``search_score`` value.

    Records without a ``search_hit`` entry are returned unmodified.

    :param psm_file: pepXML source, passed straight to ``pepxml.read``
    :return: list of record dicts in file order
    '''
    PSM = []
    # Use the reader as a context manager so the underlying file is released
    # even when a record raises (e.g. a hit without 'massdiff').
    with pepxml.read(psm_file, read_schema=False, iterative=True) as PEP:
        for row in PEP:
            if 'search_hit' in row:
                for search_hit in row['search_hit']:
                    search_hit['massdiff'] = ';'.join((
                        str(search_hit['massdiff']),
                        str(search_hit['calc_neutral_pep_mass']),
                        str(row['precursor_neutral_mass']),
                    ))
                    # Both helpers must see the ORIGINAL search_score, so the
                    # replacement dict is built before it is assigned.
                    raw_score = search_hit['search_score']
                    search_hit['search_score'] = {
                        'score': _get_score_(raw_score),
                        'evalue': _get_evalue_(raw_score),
                    }
            PSM.append(row)
    return PSM
def iprophetpepxml_csv(infile, outfile):
    """
    Export the top-ranked non-decoy hit of each spectrum query to a
    tab-separated file.

    A spectrum query is exported when it has a ``search_hit`` list and the
    first protein of its first hit does not contain the text ``DECOY``.
    The header row is written just before the first data row, so an input
    without exportable hits produces an empty file. The number of exported
    rows is printed at the end.

    :param infile: input pepxml
    :param outfile: outcsv
    :return: None
    """
    columns = (
        'retention_time_sec',
        'assumed_charge',
        'spectrum',
        'nrhit',
        'modified_peptide',
        'search_hit',
        'iprophet_probability',
        'protein_id',
        'nrproteins',
    )
    nr_rows = 0
    # csv.writer needs a text-mode file opened with newline='' on Python 3;
    # the previous binary mode ('wb') raised TypeError on the first row.
    with pepxml.read(infile) as reader, \
            open(outfile, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for hit in reader:
            if 'error_point' in hit:
                print(hit)
            if 'search_hit' not in hit:
                continue
            search_hit = hit['search_hit'][0]
            # wenguang: remove all decoy hits!
            if 'DECOY' in search_hit['proteins'][0]['protein']:
                continue
            # NOTE(review): index 1 assumes the interprophet result is the
            # second analysis_result entry -- confirm against the pipeline.
            analysis_result = search_hit['analysis_result'][1]
            row = (
                hit['retention_time_sec'],
                hit['assumed_charge'],
                hit['spectrum'],
                len(hit['search_hit']),
                search_hit['modified_peptide'],
                search_hit['peptide'],
                analysis_result['interprophet_result']['probability'],
                search_hit['proteins'][0]['protein'],
                len(search_hit['proteins']),
            )
            if nr_rows == 0:
                writer.writerow(columns)
            writer.writerow(row)
            nr_rows += 1
    print(nr_rows)
def digest_pepxml(self):
    """
    Summarise every spectrum query of the pepXML file at ``self.path``.

    One five-element list is appended to ``self.result`` per record:
    ``[spectrum, peptide, ionscore, number of proteins, num_matched_ions]``,
    all taken from the first search hit. Records without a ``search_hit``
    entry contribute ``[spectrum, '', 0, 0, 0]``.
    """
    with pepxml.read(self.path) as reader:
        for record in reader:
            if 'search_hit' not in record:
                # Placeholder row keeps one entry per spectrum.
                summary = [record['spectrum'], '', 0, 0, 0]
            else:
                summary = [
                    record['spectrum'],
                    record['search_hit'][0]['peptide'],
                    record['search_hit'][0]['search_score']['ionscore'],
                    len(record['search_hit'][0]['proteins']),
                    record['search_hit'][0]['num_matched_ions'],
                ]
            self.result.append(summary)
def getFileFractionMappingFromPepXML(pepXMLFilepath):
    """
    Assign a two-digit fraction number to every distinct ``spectrum`` value
    found in a pepXML file.

    The distinct values are sorted and numbered from 1, with the number
    rendered as a zero-padded string (``'01'``, ``'02'``, ...).

    :param pepXMLFilepath: pepXML source, passed to ``pepxml.read``
    :return: list of ``(fraction_number_string, spectrum_value)`` tuples in
        sorted order of the spectrum value
    """
    # A set gives O(1) de-duplication; the previous list membership test was
    # quadratic in the number of spectra. The result is sorted afterwards, so
    # losing first-seen order changes nothing.
    with pepxml.read(pepXMLFilepath, read_schema=False) as reader:
        distinctSpectra = {currSpectrum['spectrum'] for currSpectrum in reader}

    return [
        (str(currFractionNum).zfill(2), currDataFile)
        for currFractionNum, currDataFile in enumerate(
            sorted(distinctSpectra), start=1)
    ]
def getInfoFromPepXml(self):
    """
    Sort the spectra of the pepXML file at ``self.pathXml`` into good and bad.

    A spectrum is "good" when it has a search hit whose first hit's
    ``ionscore`` is strictly greater than ``self.goodSpectraCriteria``;
    every other spectrum (including those without any search hit) is "bad".
    The spectrum name is appended to ``self.goodBadUgly['good']`` or
    ``self.goodBadUgly['bad']`` and both totals are printed at the end.
    """
    tally = {'good': 0, 'bad': 0}
    with pepxml.read(self.pathXml) as reader:
        for record in reader:
            verdict = 'bad'
            if 'search_hit' in record:
                ion_score = record['search_hit'][0]['search_score']['ionscore']
                if ion_score > self.goodSpectraCriteria:
                    verdict = 'good'
            tally[verdict] += 1
            self.goodBadUgly[verdict].append(record['spectrum'])
    print("good: ", tally['good'])
    print("bad: ", tally['bad'])
def get_PSM_pepxml(psm_file):
    """
    Read all spectrum queries from a pepXML file and normalise their scores.

    For every search hit of every record, ``search_score`` is replaced by a
    dict with the keys ``score`` and ``evalue``, computed by ``_get_score_``
    and ``_get_evalue_`` from the original ``search_score`` value. Records
    without a ``search_hit`` entry are returned unmodified.

    :param psm_file: pepXML source, passed straight to ``pepxml.read``
    :return: list of record dicts in file order
    """
    PSM = []
    # Use the reader as a context manager so the underlying file is released
    # even when processing a record raises.
    with pepxml.read(psm_file, read_schema=False, iterative=True) as PEP:
        for row in PEP:
            if 'search_hit' in row:
                for search_hit in row['search_hit']:
                    # Both helpers must see the ORIGINAL search_score, so the
                    # replacement dict is built before it is assigned.
                    raw_score = search_hit['search_score']
                    search_hit['search_score'] = {
                        'score': _get_score_(raw_score),
                        'evalue': _get_evalue_(raw_score),
                    }
            PSM.append(row)
    return PSM
def iprophetpepxml_csv(infile, outfile):
    """
    Export the top-ranked hit of each spectrum query to a tab-separated file.

    Every spectrum query that has a ``search_hit`` list yields one row built
    from its first hit. The header row is written just before the first data
    row, so an input without any hits produces an empty file. The number of
    exported rows is printed at the end.

    :param infile: input pepxml
    :param outfile: outcsv
    :return: None
    """
    columns = (
        "retention_time_sec",
        "assumed_charge",
        "spectrum",
        "nrhit",
        "modified_peptide",
        "search_hit",
        "iprophet_probability",
        "protein_id",
        "nrproteins",
    )
    nr_rows = 0
    # csv.writer needs a text-mode file opened with newline="" on Python 3;
    # the previous binary mode ("wb") raised TypeError on the first row.
    with pepxml.read(infile) as reader, \
            open(outfile, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for hit in reader:
            if "error_point" in hit:
                print(hit)
            if "search_hit" not in hit:
                continue
            search_hit = hit["search_hit"][0]
            # NOTE(review): index 1 assumes the interprophet result is the
            # second analysis_result entry -- confirm against the pipeline.
            analysis_result = search_hit["analysis_result"][1]
            row = (
                hit["retention_time_sec"],
                hit["assumed_charge"],
                hit["spectrum"],
                len(hit["search_hit"]),
                search_hit["modified_peptide"],
                search_hit["peptide"],
                analysis_result["interprophet_result"]["probability"],
                search_hit["proteins"][0]["protein"],
                len(search_hit["proteins"]),
            )
            if nr_rows == 0:
                writer.writerow(columns)
            writer.writerow(row)
            nr_rows += 1
    print(nr_rows)
#Dec 8th 2014, Avinash Shanmugam #Script will parse a pepxml file and output as a tsv file import sys from pyteomics import pepxml, auxiliary if len(sys.argv) != 2: print "USAGE: python pepxmlparse.py <pepxmlfile>"; sys.exit(); pepxfile = sys.argv[1]; outfile = pepxfile.replace(".xml","xml.parse.tsv"); #Create pepxml reading iterator pepxitr = pepxml.read(pepxfile, read_schema=False); #Open outfile out = open(outfile,"w"); #Create and write title line to outfile titleLine = "\t".join(["peptide","modpeptide","specid","mass","charge", "iniprob","pprophprob","isfwd","protidString"]); out.write(titleLine+"\n"); lineNo = 0; #Iterate through the pepxml recoprds for pepxrec in pepxitr: #Extract needed vals from the dict returned
def read_psms(self):
    """
    Read the PSMs of ``self.pepxml_file`` and print a tab-separated report.

    The file is opened through ``gzip`` when its name ends in ``.gz``. A
    header line is printed first. For every PSM the first search hit is
    inspected and a row is printed only when the iProphet probability is at
    least 0.90 and the PTMProphet ``mean_best_prob`` (of the result whose
    ``ptm`` starts with ``STY``) is not below 0.9. Each row also reports
    whether the sequence / peptidoform is present in
    ``self.reference_peptides`` (keys ``by_sequence``, ``by_pool_sequence``,
    ``by_pool_peptidoform``).

    Progress is reported through ``eprint`` every 1000 PSMs when
    ``self.verbose >= 1``; timing information is printed at the end.
    """
    #### Set up information
    t0 = timeit.default_timer()
    stats = {'n_psms': 0}
    progress_intro = False

    #### Modification masses are matched within this absolute tolerance
    mass_tolerance = 0.01

    #### Masses counted as a phospho site; the flag marks the mass that also
    #### sets has_alanine_phospho. Order is the original first-match order.
    phospho_masses = (
        (181.01401, False),
        (243.0297, False),
        (166.998359, False),
        (151.003445, True),
    )

    #### Mass -> label appended to the residue in the all-modification
    #### peptidoform. None means "recognised, but deliberately not annotated".
    #### A second, unreachable 111.032029 entry ('[Pyro_glu]') was dropped:
    #### the first match always won.
    residue_labels = (
        (160.030649, None),
        (181.01401, '[Phospho]'),
        (243.0297, '[Phospho]'),
        (166.998359, '[Phospho]'),
        (151.003445, '[Phospho]'),
        (147.0354, '[Hydroxylation]'),
        (202.074213, '[Hydroxylation]'),
        (113.047664, '[Hydroxylation]'),
        (115.026943, '[Deamidation]'),
        (129.042594, '[Deamidation]'),
        (111.032029, '[Pyro-glu]'),
        (143.0041, '[Pyro_glu]'),
    )
    nterm_acetyl_mass = 43.018425

    #### Show information
    if self.verbose >= 1:
        eprint(f"INFO: Reading pepXML file {self.pepxml_file}")

    #### If the pepXML is gzipped, then open with gzip, else a plain open
    if re.search(r'\.gz$', self.pepxml_file):
        opener = gzip.open
    else:
        opener = open

    #### Print a header
    print("\t".join([
        'scan', 'pool', 'PepProProb', 'iProProb', 'charge', 'PTMProProb',
        'isSeqInDataset', 'isSeqInPool', 'IsPepformInPool', 'SameNoPhosphos',
        'HasAPhospho', 'sequence', 'RefPepform', 'CalcPepform',
        'PTMProProbsString', 'USI'
    ]))

    by_sequence = self.reference_peptides['by_sequence']
    by_pool_sequence = self.reference_peptides['by_pool_sequence']
    by_pool_peptidoform = self.reference_peptides['by_pool_peptidoform']

    #### Read psms from the file; both handles are closed even on error
    with opener(self.pepxml_file, 'rb') as infile, \
            pepxml.read(infile) as reader:
        for psm in reader:
            peptideprophet_probability = None
            iprophet_probability = None
            mean_best_probability = -1
            peptide_str = 'xx'
            pool = '?'
            msrun_name = '?'

            top_hit = psm['search_hit'][0]
            sequence = top_hit['peptide']
            charge = psm['assumed_charge']
            spectrum_name = psm['spectrum']

            match = re.search(r"_(pool\d)_", spectrum_name)
            if match:
                pool = match.group(1)
            match = re.match(r"(.+)\.\d+\.\d+\.\d+$", spectrum_name)
            if match:
                msrun_name = match.group(1)

            for analysis_result in top_hit['analysis_result']:
                analysis = analysis_result['analysis']
                if analysis == 'peptideprophet':
                    peptideprophet_probability = analysis_result[
                        'peptideprophet_result']['probability']
                if analysis == 'interprophet':
                    iprophet_probability = analysis_result[
                        'interprophet_result']['probability']
                if analysis == 'ptmprophet':
                    ptm_result = analysis_result['ptmprophet_result']
                    if ptm_result['ptm'][0:3] == 'STY':
                        peptide_str = ptm_result['ptm_peptide']
                        mean_best_probability = ptm_result['parameter'][
                            'mean_best_prob']

            #### Keep only confident identifications with confident sites
            keep = (iprophet_probability is not None
                    and iprophet_probability >= 0.90
                    and not mean_best_probability < 0.9)

            #### Generate a peptidoform in proper notation
            peptidoform = '??????'
            phospho_peptidoform = '??????'
            has_alanine_phospho = 'N'
            n_phosphos = 0
            if 'modifications' in top_hit:
                residues = list(sequence)
                phospho_residues = list(sequence)
                nterm = ''
                for modification in top_hit['modifications']:
                    offset = modification['position']
                    if 'mass' not in modification:
                        continue
                    mod_mass = modification['mass']

                    #### Phospho-only peptidoforms
                    for ref_mass, is_alanine in phospho_masses:
                        if abs(mod_mass - ref_mass) < mass_tolerance:
                            phospho_residues[offset - 1] += '[Phospho]'
                            n_phosphos += 1
                            if is_alanine:
                                has_alanine_phospho = 'Y'
                            break

                    #### All-mod peptidoforms
                    for ref_mass, label in residue_labels:
                        if abs(mod_mass - ref_mass) < mass_tolerance:
                            if label is not None:
                                residues[offset - 1] += label
                            break
                    else:
                        if abs(mod_mass - nterm_acetyl_mass) < mass_tolerance:
                            nterm = '[Acetyl]-'
                        else:
                            print(
                                f"ERROR: Unable to translate {modification}"
                            )

                peptidoform = nterm + ''.join(residues)
                phospho_peptidoform = ''.join(phospho_residues)

            is_sequence_in_dataset = 'Y' if sequence in by_sequence else 'N'

            #### Compare against the reference peptide of the same pool
            pool_sequence = f"{pool}-{sequence}"
            is_sequence_in_pool = 'N'
            same_number_of_phosmods = 'N'
            reference_peptidoform = '---------------------'
            if pool_sequence in by_pool_sequence:
                is_sequence_in_pool = 'Y'
                reference = by_pool_sequence[pool_sequence][0]
                ref_n_mods = int(reference['n_mods'])
                # On a mismatch report both counts as "found,expected"
                same_number_of_phosmods = f"{n_phosphos},{ref_n_mods}"
                if n_phosphos == ref_n_mods:
                    same_number_of_phosmods = 'Y'
                reference_peptidoform = reference['peptidoform']

            pool_peptidoform = f"{pool}-{phospho_peptidoform}"
            is_peptidoform_in_pool = 'N'
            if pool_peptidoform in by_pool_peptidoform:
                is_peptidoform_in_pool = 'Y'

            if keep:
                usi = f"mzspec:PXD007058:{msrun_name}:scan:{psm['start_scan']}:{peptidoform}/{charge}"
                row = [
                    str(psm['start_scan']), pool,
                    str(peptideprophet_probability),
                    str(iprophet_probability),
                    str(charge),
                    str(mean_best_probability), is_sequence_in_dataset,
                    is_sequence_in_pool, is_peptidoform_in_pool,
                    same_number_of_phosmods, has_alanine_phospho, sequence,
                    reference_peptidoform, peptidoform, peptide_str, usi
                ]
                print("\t".join(row))

            #### Update counters and print progress
            stats['n_psms'] += 1
            if self.verbose >= 1 and stats['n_psms'] % 1000 == 0:
                if not progress_intro:
                    eprint("INFO: Reading psms.. ", end='')
                    progress_intro = True
                eprint(f"{stats['n_psms']}.. ", end='', flush=True)

    if self.verbose >= 1:
        eprint("")

    #### Print final timing information
    t1 = timeit.default_timer()
    print(f"INFO: Read {stats['n_psms']} psms from {self.pepxml_file}")
    print(f"INFO: Elapsed time: {t1-t0}")
    print(f"INFO: Processed {stats['n_psms']/(t1-t0)} psms per second")
""" from __future__ import print_function import sys import os.path #from msproteomicstoolslib.format import pepXMLReader import csv csv.field_size_limit(sys.maxsize) from pyteomics import pepxml infile = sys.argv[1] outfile = os.path.splitext(infile)[0] + '.csv' reader = pepxml.read(infile) writer = csv.writer(open(outfile, 'w'), delimiter='\t') ## MYRIMATCH { 'end_scan': 1380, 'retention_time_sec': 5190.16999999998, 'index': 160, 'assumed_charge': 2, 'spectrum': '5P_HDMSE_121214_20.1380.1380.2', 'search_hit': [ { 'hit_rank': 1, 'calc_neutral_pep_mass': 3123.6467143833, 'modifications': [],
decrement = False try: csv.field_size_limit(maxInt) except OverflowError: maxInt = int(maxInt/10) decrement = True from pyteomics import pepxml infile = sys.argv[1] outfile = os.path.splitext(infile)[0] + '.csv' reader = pepxml.read(infile) writer = csv.writer(open(outfile, 'w'), delimiter='\t') ## MYRIMATCH { 'end_scan': 1380, 'retention_time_sec': 5190.16999999998, 'index': 160, 'assumed_charge': 2, 'spectrum': '5P_HDMSE_121214_20.1380.1380.2', 'search_hit': [ { 'hit_rank': 1, 'calc_neutral_pep_mass': 3123.6467143833, 'modifications': [],
def parse_xml_based_format_to_identification_table(
        path2XML_file: str,
        path2fastaDB: str,
        decoy_prefix: str = 'DECOY',
        is_idXML: bool = False,
        fasta_reader_param: Dict[str, str] = {
            'filter_decoy': True,
            'decoy_string': 'DECOY'
        },
        remove_if_not_matched: bool = True) -> pd.DataFrame:
    """parse either a pepXML or an idXML file to generate an identification table

    One row is produced per (peptide, non-decoy protein) pair. The peptide is
    located in the protein sequence after removing every parenthesised
    modification annotation, e.g. ``M(Oxidation)K`` is searched as ``MK``.

    :param path2XML_file: The path to the input pepXML files
    :type path2XML_file: str
    :param path2fastaDB: The path to a fasta sequence database to obtain the protein sequences
    :type path2fastaDB: str
    :param decoy_prefix: the text marking decoy proteins, default is DECOY; any accession containing it is skipped
    :type decoy_prefix: str, optional
    :param is_idXML: Whether or not the provided file is an idXML, default is false which assume the provided file is a pepXML file, defaults to False
    :type is_idXML: bool, optional
    :param fasta_reader_param: A dict of parameters for controlling the behavior of the fasta reader, defaults to {'filter_decoy':True, 'decoy_string':'DECOY' }
    :type fasta_reader_param: Dict[str,str], optional
    :param remove_if_not_matched: remove the peptide if it could not be matched to the parent protein, defaults to True
    :type remove_if_not_matched: bool, optional
    :raises IOError: if the fasta database could not be open
    :raises ValueError: if the XML file can not be open
    :raises KeyError: if a protein id defined in the XML file could not be extracted from a matched sequence database
    :raises ValueError: if the peptide can not be mapped to the identified protein
    :return: the identification table with the columns peptide, protein, start_index and end_index
    :rtype: pd.DataFrame
    """
    import re  # local import: only needed for stripping modification tags

    def _bare_accession(raw_accession: str) -> str:
        # 'db|ACC|name' style headers carry the accession in the second field;
        # anything without a '|' is used verbatim.
        if '|' in raw_accession:
            return raw_accession.split('|')[1]
        return raw_accession

    # load the fasta files and extract the accession of the proteins
    # (the default dict above is only unpacked, never mutated)
    try:
        sequence_dict: Dict[str, str] = fasta2dict(path2fastaDB,
                                                   **fasta_reader_param)
    except Exception as exp:
        raise IOError(
            f'While parsing your input fasta file: {path2fastaDB}, the following error was encountered: {exp}'
        ) from exp
    # check that the file exists:
    if not os.path.exists(path2XML_file):
        raise ValueError(f'The provided path: {path2XML_file} does not exist!')
    # allocate a list to hold peptide and protein list
    peptides: List[str] = []
    protein_acc: List[str] = []
    # parse the XML file
    if is_idXML:
        with idxml.IDXML(path2XML_file) as reader:
            for elem in reader:
                for hit in elem['PeptideHit']:
                    for prot in hit['protein']:
                        if decoy_prefix not in prot['accession']:
                            peptides.append(hit['sequence'])
                            protein_acc.append(
                                _bare_accession(prot['accession']))
    else:
        with pepxml.read(path2XML_file) as reader:
            for elem in reader:
                # spectrum queries without any hit carry no 'search_hit' key
                for hit in elem.get('search_hit', ()):
                    for protein in hit['proteins']:
                        if decoy_prefix not in protein['protein']:
                            peptides.append(hit['peptide'])
                            # same accession handling as the idXML branch;
                            # a bare split('|')[1] raised IndexError on
                            # accessions without a '|'
                            protein_acc.append(
                                _bare_accession(protein['protein']))
    # extract the start and end index of the peptides from the parent proteins
    start_index: List[int] = []
    end_index: List[int] = []
    for peptide, accession in zip(peptides, protein_acc):
        # get the protein sequence
        try:
            prot_seq: str = sequence_dict[accession]
        except KeyError as exp:
            raise KeyError(
                f'Database mismatch, the current protein accession: {accession} is not defined in the provided sequence database'
            ) from exp
        # drop "(...)" modification annotations before searching
        bare_peptide = re.sub(r'\([^)]*\)', '', peptide)
        start = prot_seq.find(bare_peptide)
        if start == -1:
            if not remove_if_not_matched:
                raise ValueError(
                    f'Peptide sequence: {peptide} could not be extracted from protein sequence: {prot_seq} with accession: {accession}'
                )
            # placeholder row, dropped below; previously the end index was
            # computed from an unbound/stale peptide length here
            start_index.append(-1)
            end_index.append(-1)
        else:
            start_index.append(start)
            end_index.append(start + len(bare_peptide))
    # build the data frame
    ident_table: pd.DataFrame = pd.DataFrame({
        'peptide': peptides,
        'protein': protein_acc,
        'start_index': start_index,
        'end_index': end_index
    })
    # filter the non-matched peptides
    ident_table = ident_table.loc[ident_table['start_index'] != -1, :]
    # return the results
    return ident_table
def convertPepxmlToCSV(pepXMLFilepath, csvOutputFilepath, fractionMapping=None):
    """
    Convert a pepXML file into a comma-separated de novo result table.

    One line is written per spectrum, built from the first search hit
    (``search_hit[0]``). Layout of a hit as used here::

        search_hit [list]
          -> calc_neutral_pep_mass
          -> modified_peptide
          -> peptide
          -> massdiff
          -> search_score
             -> positional_conf
             -> PeaksDenovoScore

    Columns that the pepXML does not provide are emitted as constants:
    RT is ``0`` and Area, PTM and mode are blank.

    :param pepXMLFilepath: pepXML source, passed to ``pepxml.read``
    :param csvOutputFilepath: path of the CSV file to (over)write
    :param fractionMapping: optional iterable of ``(fraction, spectrum)``
        pairs; the fraction of the matching pair is written as an int,
        spectra without a match get fraction 0
    """
    protonMass = 1.007276466771

    columnNames = [
        'Fraction', 'Scan', 'Source File', 'Peptide', 'Tag Length', 'ALC (%)',
        'length', 'm/z', 'z', 'RT', 'Area', 'Mass', 'ppm', 'PTM',
        'local confidence (%)', 'tag (>=0%)', 'mode'
    ]

    # Build the spectrum -> fraction lookup once instead of scanning the
    # mapping for every spectrum. Later pairs override earlier ones, which
    # matches the old scan that kept the last matching pair.
    fractionBySpectrum = {}
    if fractionMapping is not None:
        for fractionFilePair in fractionMapping:
            fractionBySpectrum[fractionFilePair[1]] = fractionFilePair[0]

    with open(csvOutputFilepath, "w") as fout:
        with pepxml.read(pepXMLFilepath) as reader:
            fout.write(','.join(columnNames) + '\n')
            for currSpectrum in reader:
                fileName = currSpectrum['spectrum']
                fractionNum = 0
                if fileName in fractionBySpectrum:
                    fractionNum = int(fractionBySpectrum[fileName])

                topHit = currSpectrum['search_hit'][0]
                searchScore = topHit['search_score']
                scan = currSpectrum['start_scan']

                # Prefer the modified form when the hit carries modifications
                peptide = topHit['peptide']
                if topHit['modified_peptide'] != peptide:
                    peptide = topHit['modified_peptide']

                # Remove anything between parentheses, to get rid of
                # modifications from the peptide string
                # e.g. LLEGEEC(+57.02)R --> LLEGEECR
                strippedPeptide = re.sub(r'\([^)]*\)', '', peptide)
                tagLength = len(strippedPeptide)

                deNovoScore = int(100.0 * float(searchScore['PeaksDenovoScore']))
                z = int(currSpectrum['assumed_charge'])
                precursorMass = float(currSpectrum['precursor_neutral_mass'])
                calcMass = float(topHit['calc_neutral_pep_mass'])
                mOverZ = (precursorMass / float(z)) + protonMass
                ppm = topHit['massdiff']

                # Confidence must be converted from
                # "0.96,0.98,0.99,0.99,0.99,0.99,0.99" in pepXML to
                # "96 98 99 99 99 99 99" for CSV
                localConfidenceString = ' '.join(
                    str(int(100.0 * float(currConfidence)))
                    for currConfidence in searchScore['positional_conf'].split(','))

                # Now lets spit out the entry
                fields = [
                    str(fractionNum),       # Fraction
                    str(scan),              # Scan
                    fileName,               # Source File
                    peptide,                # Peptide
                    str(tagLength),         # Tag Length
                    str(deNovoScore),       # ALC (%)
                    str(tagLength),         # length
                    str(mOverZ),            # m/z
                    str(z),                 # z
                    '0',                    # RT: bogus constant, not in the pepXML
                    '',                     # Area is blank for now
                    str(calcMass),          # Mass
                    str(ppm),               # ppm
                    '',                     # PTM name is blank for now
                    localConfidenceString,  # local confidence (%)
                    peptide,                # tag (>=0%)
                    '',                     # mode is blank for now
                ]
                fout.write(','.join(fields) + '\n')
`types` and of charges from 1 to `maxharge`. """ for i in xrange(1, len(peptide) - 1): for ion_type in types: for charge in xrange(1, maxcharge + 1): if ion_type[0] in 'abc': yield mass.fast_mass(peptide[:i], ion_type=ion_type, charge=charge) else: yield mass.fast_mass(peptide[i:], ion_type=ion_type, charge=charge) with mgf.read('example.mgf') as spectra, pepxml.read( 'example.pep.xml') as psms: spectrum = next(spectra) psm = next(psms) pylab.figure() pylab.title('Theoretical and experimental spectra for ' + psm['search_hit'][0]['peptide']) pylab.xlabel('m/z, Th') pylab.ylabel('Intensity, rel. units') pylab.bar(spectrum['m/z array'], spectrum['intensity array'], width=0.1, linewidth=2, edgecolor='black') theor_spectrum = list( fragments(psm['search_hit'][0]['peptide'], maxcharge=psm['assumed_charge']))
def get_from_pepxmlfile(self, pepxmlfile, min_charge=1, max_charge=0, allowed_peptides=False, prefix='DECOY_', FDR_type=None, termini=set([2,1,0])):
    """
    Load the top-ranked PSMs of a pepXML file into this container.

    Search-space totals are first read from the file's ``parameter``
    elements; if that fails the error is logged and 0 is returned. Then, for
    every spectrum query, the first search hit is turned into a ``Peptide``
    and stored through ``self.add_elem`` when all of these hold:

    * the assumed charge lies in ``[min_charge, max_charge]`` (no upper
      bound when ``max_charge`` is falsy);
    * ``num_tol_term`` of the first protein is in ``termini``;
    * the sequence is listed in ``allowed_peptides`` (when given);
    * the sequence contains only standard amino acids;
    * for peptide-level FDR, no better (lower) e-value was already seen for
      the same sequence;
    * at least one protein is known for the sequence and no position carries
      more than one modification.

    :param pepxmlfile: path of the pepXML file
    :param min_charge: minimum accepted precursor charge
    :param max_charge: maximum accepted precursor charge, 0 disables the limit
    :param allowed_peptides: path of a text file with one accepted peptide
        per line, or a falsy value to accept every peptide
    :param prefix: protein-name prefix marking decoys; a PSM is a decoy only
        when ALL of its proteins carry the prefix
    :param FDR_type: a value starting with ``'peptide'`` keeps only the best
        scoring PSM per sequence; ``None`` is treated as PSM-level
    :param termini: accepted ``num_tol_term`` values (only read, never
        mutated, so the shared default set is safe)
    :return: 0 when the pepXML parameters could not be read, otherwise None
    """
    allowed_peptides_set = None
    if allowed_peptides:
        # Close the list file deterministically instead of leaking the handle
        with open(allowed_peptides) as allowed_file:
            allowed_peptides_set = set(x.strip() for x in allowed_file)

    # FDR_type defaults to None; calling .startswith on it raised
    # AttributeError as soon as the first PSM passed the filters.
    peptide_level = bool(FDR_type) and FDR_type.startswith('peptide')

    try:
        pepxml_params = {k: v for d in pepxml.iterfind(pepxmlfile, 'parameter name', read_schema=False) for k, v in d.items()}
        self.total_number_of_peptides_in_searchspace = int(pepxml_params.get('modelling, total peptides used', self.total_number_of_peptides_in_searchspace))
        self.total_number_of_proteins_in_searchspace = int(pepxml_params.get('modelling, total proteins used', self.total_number_of_proteins_in_searchspace))
        self.total_number_of_spectra = int(pepxml_params.get('modelling, total spectra used', self.total_number_of_spectra))
    except Exception as e:
        logger.critical('Error reading pepXML file: %s, %s ', e, e.args)
        return 0

    best_scores = {}
    standard_aminoacids = set(k for k in mass.std_aa_comp if '-' not in k)
    first_psm = True
    with pepxml.read(pepxmlfile, read_schema=False) as reader:
        for record in reader:
            if 'search_hit' not in record:
                continue
            charge_ok = int(min_charge) <= int(record['assumed_charge']) and (
                int(record['assumed_charge']) <= int(max_charge) or not max_charge)
            if not charge_ok:
                continue

            top_hit = record['search_hit'][0]

            if first_psm:
                # One-off sanity warnings, emitted for the first accepted charge state
                if 'num_missed_cleavages' not in top_hit:
                    logger.warning('Missed cleavages are missing in pepXML file, using 0 for all peptides')
                try:
                    float(record['retention_time_sec'])
                except Exception:
                    try:
                        float(record['spectrum'].split(',')[2].split()[0])
                    except Exception:
                        logger.warning('RT experimental is missing in pepXML file, using 0 value for all peptides')
                first_psm = False

            if 'peptide' not in top_hit:
                continue
            sequence = top_hit['peptide']
            num_tol_term_tmp = top_hit['proteins'][0]['num_tol_term']
            if not (num_tol_term_tmp in termini and (not allowed_peptides or sequence in allowed_peptides_set)):
                continue

            search_score = top_hit['search_score']
            try:
                evalue = search_score['expect']
            except KeyError:
                # No e-value reported: fall back to the inverse ion score.
                # A missing/zero ion score propagates as before; the old
                # "except IOError" handler could never trigger.
                evalue = 1.0 / float(search_score['ionscore'])

            tags = {k: float(v) for k, v in search_score.items() if k.startswith('tmt')}

            if peptide_level and best_scores.get(sequence, 1e6) < evalue:
                continue
            if set(sequence).difference(standard_aminoacids):
                continue

            if peptide_level:
                best_scores[sequence] = evalue
            mc = top_hit.get('num_missed_cleavages', 0)
            modifications = top_hit['modifications']
            try:
                sumI = 10 ** float(search_score['sumI'])
            except (KeyError, TypeError, ValueError, OverflowError):
                sumI = 0
            try:
                frag_mt = float(search_score['fragmentMT'])
            except (KeyError, TypeError, ValueError):
                frag_mt = None
            spectrum = record['spectrum']
            pcharge = record['assumed_charge']
            mass_exp = record['precursor_neutral_mass']

            if pepxmlfile not in infiles_dict:
                infiles_dict[pepxmlfile] = len(infiles_dict)
            infile_current = infiles_dict[pepxmlfile]
            pept = Peptide(sequence=sequence, settings=self.settings, evalue=evalue, pcharge=pcharge, mass_exp=mass_exp, modifications=modifications, modification_list=self.modification_list, custom_aa_mass=self.aa_list, sumI=sumI, mc=mc, infile=infile_current, frag_mt=frag_mt, tags=tags)

            # RT in minutes: prefer the retention_time_sec attribute, then
            # the third comma-separated field of the spectrum title, else 0
            try:
                RT_exp = float(record['retention_time_sec']) / 60
            except (KeyError, TypeError, ValueError):
                try:
                    RT_exp = float(spectrum.split(',')[2].split()[0])
                except (AttributeError, IndexError, ValueError):
                    RT_exp = 0

            if not all(protein['protein'].startswith(prefix) for protein in top_hit['proteins']):
                pept.note = 'target'
            else:
                pept.note = 'decoy'
            pept.num_tol_term = num_tol_term_tmp
            pept.next_aa = top_hit['proteins'][0]['peptide_next_aa']
            pept.prev_aa = top_hit['proteins'][0]['peptide_prev_aa']

            if pept.sequence not in self.proteins_dict:
                # NOTE(review): indexing a missing key below presumably relies
                # on proteins_dict being a defaultdict(list) -- confirm.
                for prot in top_hit['proteins']:
                    prot_name = get_dbname(prot)
                    if prot_name not in [protein.dbname for protein in self.proteins_dict[pept.sequence]]:
                        self.proteins_dict[pept.sequence].append(Protein(dbname=get_dbname(prot), description=prot.get('protein_descr', None)))

            if len(self.proteins_dict[pept.sequence]) and (not modifications or Counter(v['position'] for v in modifications).most_common(1)[0][1] <= 1):
                self.add_elem((pept, spectrum, RT_exp))

    self.spectrumlist = np.array(self.spectrumlist)

    if peptide_level:
        # Drop every PSM that is worse than the best one for its sequence;
        # indices are collected from the end towards the start.
        js = [
            j for j in range(len(self.peptideslist) - 1, -1, -1)
            if self.peptideslist[j].evalue > best_scores.get(self.peptideslist[j].sequence, 1e6)
        ]
        self.rem_elements(js)
# converter of protxml files to csv INCLUDING details from StPeter from pyteomics import protxml, pepxml from math import nan import pandas as pd pp = protxml.read("StPeterOut.prot.xml") pepp = pepxml.read("Sample.pep.xml") pdata = protxml.DataFrame(pp) pepdata = pepxml.DataFrame(pepp) pepdata["Quantification_SI"] = nan def extract_stpeter(analysis): if isinstance(analysis, list): a = analysis[0] if (len(a) == 2): b = a.get("StPeterQuant") SI = b.get("SI") SIn = b.get("SIn") peps = b.get("StPeterQuant_peptide") peptable = pd.DataFrame(peps) pepseqs = peptable.get("sequence").to_string() # adding SI values to peptide table for index, r in peptable.iterrows(): pepdata.at[pepdata["modified_peptide"] == r.get("sequence"), "Quantification_SI"] = r.get("SI") pepSI = peptable.get("SI").to_string() return ({ "analysis": a.get("analysis"), "SI": SI,