Esempio n. 1
0
def get_PSM_pepxml(psm_file):
    '''
    Load every record of a pepXML file into memory.

    For each search hit of a record, ``massdiff`` is rewritten as the string
    ``"<massdiff>;<calc_neutral_pep_mass>;<precursor_neutral_mass>"`` and
    ``search_score`` is replaced by a dict with the keys ``score`` and
    ``evalue`` computed by ``_get_score_`` and ``_get_evalue_`` from the
    original ``search_score`` value. Records without a ``search_hit`` entry
    are kept unchanged.

    :param psm_file: psm file, passed straight to ``pepxml.read``
    :return: list of psm records (one per record yielded by the reader)
    '''
    PSM = []
    # Use the reader as a context manager so the underlying file is released
    # even when a malformed record makes the loop raise.
    with pepxml.read(psm_file, read_schema=False, iterative=True) as PEP:
        for row in PEP:
            if 'search_hit' in row:
                for search_hit in row['search_hit']:
                    # keep the mass difference together with both masses it
                    # was derived from
                    search_hit['massdiff'] = ';'.join(
                        str(value) for value in (
                            search_hit['massdiff'],
                            search_hit['calc_neutral_pep_mass'],
                            row['precursor_neutral_mass'],
                        ))
                    # adjust search scores
                    raw_score = search_hit['search_score']
                    search_hit['search_score'] = {
                        'score': _get_score_(raw_score),
                        'evalue': _get_evalue_(raw_score),
                    }
            PSM.append(row)
    return PSM
Esempio n. 2
0
    def iprophetpepxml_csv(infile, outfile):
        """
        Write the first search hit of every non-decoy spectrum query of a
        pepXML file to a tab-separated file.

        A header row with the column names is written before the first data
        row. Spectrum queries without a ``search_hit`` entry, and those whose
        first hit's first protein name contains ``DECOY``, are skipped. Records
        containing an ``error_point`` key are echoed to stdout. The number of
        rows written is printed at the end.

        :param infile: input pepxml
        :param outfile: outcsv
        :return: None
        """
        header_set = False
        nr_rows = 0
        # Both resources are context-managed so they are released even when a
        # record lacks an expected key. The output is opened in text mode with
        # newline='' as the Python 3 csv module requires; binary mode ('wb')
        # makes writerow() fail there because csv emits str, not bytes.
        with pepxml.read(infile) as reader, \
                open(outfile, 'w', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            for hit in reader:
                if 'error_point' in hit:
                    print(hit)

                if 'search_hit' not in hit:
                    continue
                search_hit = hit['search_hit'][0]
                # wenguang: remove all decoy hits!
                if 'DECOY' in search_hit['proteins'][0]['protein']:
                    continue

                nr_rows += 1
                # NOTE(review): the iProphet result is read from the second
                # analysis_result entry by position -- confirm the input
                # always lists it there.
                analysis_result = search_hit['analysis_result'][1]
                result = {
                    'retention_time_sec': hit['retention_time_sec'],
                    'assumed_charge': hit['assumed_charge'],
                    'spectrum': hit['spectrum'],
                    'nrhit': len(hit['search_hit']),
                    'modified_peptide': search_hit['modified_peptide'],
                    'search_hit': search_hit['peptide'],
                    'iprophet_probability':
                        analysis_result['interprophet_result']['probability'],
                    'protein_id': search_hit['proteins'][0]['protein'],
                    'nrproteins': len(search_hit['proteins']),
                }
                if not header_set:
                    writer.writerow(result.keys())
                    header_set = True
                writer.writerow(result.values())
            print(nr_rows)
 def digest_pepxml(self):
     """Append one summary row per record of ``self.path`` to ``self.result``.

     A row is ``[spectrum, peptide, ionscore, number of proteins,
     num_matched_ions]``, taken from the first search hit. Records without
     a ``search_hit`` entry produce ``[spectrum, '', 0, 0, 0]``.
     """
     with pepxml.read(self.path) as reader:
         for query in reader:
             spectrum = query['spectrum']
             if 'search_hit' not in query:
                 # no identification: keep the spectrum with empty values
                 row = [spectrum, '', 0, 0, 0]
             else:
                 top_hit = query['search_hit'][0]
                 row = [
                     spectrum,
                     top_hit['peptide'],
                     top_hit['search_score']['ionscore'],
                     len(top_hit['proteins']),
                     top_hit['num_matched_ions'],
                 ]
             self.result.append(row)
Esempio n. 4
0
def getFileFractionMappingFromPepXML(pepXMLFilepath):
    """Number the distinct ``spectrum`` values of a pepXML file in sorted order.

    :param pepXMLFilepath: pepXML file, passed straight to ``pepxml.read``
    :return: list of ``(fraction, spectrum)`` tuples, where ``fraction`` is the
        1-based position of ``spectrum`` in sorted order as a string
        zero-padded to two digits (``'01'``, ``'02'``, ...)
    """
    # A set de-duplicates in O(1) per record; a list membership test would
    # rescan every value seen so far for each record in the file.
    # (assumes the 'spectrum' values are hashable, e.g. strings)
    with pepxml.read(pepXMLFilepath, read_schema=False) as reader:
        uniqueSpectra = {currSpectrum['spectrum'] for currSpectrum in reader}

    return [(str(currFractionNum).zfill(2), currDataFile)
            for currFractionNum, currDataFile in enumerate(
                sorted(uniqueSpectra), start=1)]
Esempio n. 5
0
 def getInfoFromPepXml(self):
     """Sort the spectra of ``self.pathXml`` into good and bad and print the counts.

     A spectrum is "good" when it has a ``search_hit`` entry whose first hit
     has an ``ionscore`` above ``self.goodSpectraCriteria``; every other
     spectrum is "bad". Spectrum names are appended to
     ``self.goodBadUgly['good']`` or ``self.goodBadUgly['bad']``.
     """
     tally = {'good': 0, 'bad': 0}
     with pepxml.read(self.pathXml) as reader:
         for query in reader:
             # bad unless the first hit scores above the threshold
             verdict = 'bad'
             if 'search_hit' in query:
                 ionscore = query['search_hit'][0]['search_score']['ionscore']
                 if ionscore > self.goodSpectraCriteria:
                     verdict = 'good'
             tally[verdict] += 1
             self.goodBadUgly[verdict].append(query['spectrum'])
     print("good: ", tally['good'])
     print("bad: ", tally['bad'])
Esempio n. 6
0
def get_PSM_pepxml(psm_file):
    """Load every record of a pepXML file into memory.

    For each search hit of a record, ``search_score`` is replaced by a dict
    with the keys ``score`` and ``evalue`` computed by ``_get_score_`` and
    ``_get_evalue_`` from the original ``search_score`` value. Records
    without a ``search_hit`` entry are kept unchanged.

    :param psm_file: pepXML file, passed straight to ``pepxml.read``
    :return: list of records (one per record yielded by the reader)
    """
    PSM = []
    # Use the reader as a context manager so the underlying file is released
    # even when a malformed record makes the loop raise.
    with pepxml.read(psm_file, read_schema=False, iterative=True) as PEP:
        for row in PEP:
            if 'search_hit' in row:
                for search_hit in row['search_hit']:
                    # adjust search scores
                    raw_score = search_hit['search_score']
                    search_hit['search_score'] = {
                        'score': _get_score_(raw_score),
                        'evalue': _get_evalue_(raw_score),
                    }
            PSM.append(row)
    return PSM
    def iprophetpepxml_csv(infile, outfile):
        """
        Write the first search hit of every spectrum query of a pepXML file
        to a tab-separated file.

        A header row with the column names is written before the first data
        row. Spectrum queries without a ``search_hit`` entry are skipped.
        Records containing an ``error_point`` key are echoed to stdout. The
        number of rows written is printed at the end.

        :param infile: input pepxml
        :param outfile: outcsv
        :return: None
        """
        header_set = False
        nr_rows = 0
        # Both resources are context-managed so they are released even when a
        # record lacks an expected key. The output is opened in text mode with
        # newline="" as the Python 3 csv module requires; binary mode ("wb")
        # makes writerow() fail there because csv emits str, not bytes.
        with pepxml.read(infile) as reader, \
                open(outfile, "w", newline="") as f:
            writer = csv.writer(f, delimiter="\t")
            for hit in reader:
                if "error_point" in hit:
                    print(hit)

                if "search_hit" not in hit:
                    continue

                nr_rows += 1
                search_hit = hit["search_hit"][0]
                # NOTE(review): the iProphet result is read from the second
                # analysis_result entry by position -- confirm the input
                # always lists it there.
                analysis_result = search_hit["analysis_result"][1]
                result = {
                    "retention_time_sec": hit["retention_time_sec"],
                    "assumed_charge": hit["assumed_charge"],
                    "spectrum": hit["spectrum"],
                    "nrhit": len(hit["search_hit"]),
                    "modified_peptide": search_hit["modified_peptide"],
                    "search_hit": search_hit["peptide"],
                    "iprophet_probability":
                        analysis_result["interprophet_result"]["probability"],
                    "protein_id": search_hit["proteins"][0]["protein"],
                    "nrproteins": len(search_hit["proteins"]),
                }
                if not header_set:
                    writer.writerow(result.keys())
                    header_set = True
                writer.writerow(result.values())
            print(nr_rows)
#Dec 8th 2014, Avinash Shanmugam
#Script will parse a pepxml file and output as a tsv file

import sys
from pyteomics import pepxml, auxiliary

if len(sys.argv) != 2:

	print "USAGE: python pepxmlparse.py <pepxmlfile>";
	sys.exit();

pepxfile = sys.argv[1];
outfile = pepxfile.replace(".xml","xml.parse.tsv");

#Create pepxml reading iterator
pepxitr = pepxml.read(pepxfile, read_schema=False);

#Open outfile
out = open(outfile,"w");

#Create and write title line to outfile
titleLine = "\t".join(["peptide","modpeptide","specid","mass","charge", "iniprob","pprophprob","isfwd","protidString"]);
out.write(titleLine+"\n");

lineNo = 0;

#Iterate through the pepxml records

for pepxrec in pepxitr:

	#Extract needed vals from the dict returned
Esempio n. 9
0
    def read_psms(self):
        """Read PSMs from ``self.pepxml_file`` and print the kept ones as TSV.

        The file is opened with ``gzip`` when its name ends in ``.gz`` and as
        a plain binary file otherwise. A header line is printed to stdout,
        followed by one tab-separated row per kept PSM. Only the first search
        hit of each record is used. A PSM is kept when its iProphet
        probability is at least 0.90 and a PTMProphet result whose ``ptm``
        value starts with ``STY`` reports a ``mean_best_prob`` of at least 0.9.

        Reads ``self.verbose`` (progress is reported through ``eprint`` when
        it is >= 1) and ``self.reference_peptides`` (keys ``by_sequence``,
        ``by_pool_sequence`` and ``by_pool_peptidoform``). The number of PSMs
        read and timing information are printed at the end.

        :return: None
        """

        #### Set up information
        t0 = timeit.default_timer()
        stats = {'n_psms': 0}
        progress_intro = False

        #### Show information
        if self.verbose >= 1:
            eprint(f"INFO: Reading pepXML file {self.pepxml_file}")

        #### If the pepXML is gzipped, then open with gzip, else a plain open
        if re.search(r'\.gz$', self.pepxml_file):
            infile = gzip.open(self.pepxml_file)
        else:
            infile = open(self.pepxml_file, 'rb')

        #### try/finally so the file is closed even if parsing fails part-way
        try:
            #### Print a header
            print("\t".join([
                'scan', 'pool', 'PepProProb', 'iProProb', 'charge',
                'PTMProProb', 'isSeqInDataset', 'isSeqInPool',
                'IsPepformInPool', 'SameNoPhosphos', 'HasAPhospho',
                'sequence', 'RefPepform', 'CalcPepform', 'PTMProProbsString',
                'USI'
            ]))

            #### Read psms from the file
            with pepxml.read(infile) as reader:
                for psm in reader:
                    row = self._format_psm_row(psm)
                    if row is not None:
                        print("\t".join(row))

                    #### Update counters and print progress every 1000 psms
                    stats['n_psms'] += 1
                    if self.verbose >= 1 and stats['n_psms'] % 1000 == 0:
                        if not progress_intro:
                            eprint("INFO: Reading psms.. ", end='')
                            progress_intro = True
                        eprint(f"{stats['n_psms']}.. ", end='', flush=True)
        finally:
            infile.close()

        if self.verbose >= 1:
            eprint("")

        #### Print final timing information
        t1 = timeit.default_timer()
        print(f"INFO: Read {stats['n_psms']} psms from {self.pepxml_file}")
        print(f"INFO: Elapsed time: {t1-t0}")
        print(f"INFO: Processed {stats['n_psms']/(t1-t0)} psms per second")

    def _format_psm_row(self, psm):
        """Return the report columns for one PSM, or None if it is not kept."""
        # NOTE(review): a record without 'search_hit' raises KeyError here --
        # confirm the input always carries at least one hit per record.
        top_hit = psm['search_hit'][0]
        sequence = top_hit['peptide']
        charge = psm['assumed_charge']
        spectrum_name = psm['spectrum']

        #### Pool and MS run name are parsed out of the spectrum name
        pool = '?'
        match = re.search(r"_(pool\d)_", spectrum_name)
        if match:
            pool = match.group(1)
        msrun_name = '?'
        match = re.match(r"(.+)\.\d+\.\d+\.\d+$", spectrum_name)
        if match:
            msrun_name = match.group(1)

        #### Collect the probabilities reported by the analysis tools
        peptideprophet_probability = None
        iprophet_probability = None
        mean_best_probability = -1
        peptide_str = 'xx'
        for analysis_result in top_hit['analysis_result']:
            analysis = analysis_result['analysis']
            if analysis == 'peptideprophet':
                peptideprophet_probability = analysis_result[
                    'peptideprophet_result']['probability']
            elif analysis == 'interprophet':
                iprophet_probability = analysis_result[
                    'interprophet_result']['probability']
            elif analysis == 'ptmprophet':
                ptm_result = analysis_result['ptmprophet_result']
                if ptm_result['ptm'][0:3] == 'STY':
                    peptide_str = ptm_result['ptm_peptide']
                    mean_best_probability = ptm_result['parameter'][
                        'mean_best_prob']

        keep = False
        if iprophet_probability is not None and iprophet_probability >= 0.90:
            keep = True
        if mean_best_probability < 0.9:
            keep = False

        #### Generate a peptidoform in proper notation. This runs for dropped
        #### psms too, so untranslatable modifications are always reported.
        peptidoform = '??????'
        phospho_peptidoform = '??????'
        has_alanine_phospho = 'N'
        n_phosphos = 0
        if 'modifications' in top_hit:
            (peptidoform, phospho_peptidoform, n_phosphos,
             has_alanine_phospho) = self._build_peptidoforms(
                 sequence, top_hit['modifications'])

        #### Compare against the reference peptides
        reference = self.reference_peptides
        is_sequence_in_dataset = 'N'
        if sequence in reference['by_sequence']:
            is_sequence_in_dataset = 'Y'

        pool_sequence = f"{pool}-{sequence}"
        is_sequence_in_pool = 'N'
        same_number_of_phosmods = 'N'
        reference_peptidoform = '---------------------'
        if pool_sequence in reference['by_pool_sequence']:
            is_sequence_in_pool = 'Y'
            reference_entry = reference['by_pool_sequence'][pool_sequence][0]
            ref_n_mods = int(reference_entry['n_mods'])
            if n_phosphos == ref_n_mods:
                same_number_of_phosmods = 'Y'
            else:
                #### On a mismatch, show both counts instead of a flag
                same_number_of_phosmods = f"{n_phosphos},{ref_n_mods}"
            reference_peptidoform = reference_entry['peptidoform']

        is_peptidoform_in_pool = 'N'
        pool_peptidoform = f"{pool}-{phospho_peptidoform}"
        if pool_peptidoform in reference['by_pool_peptidoform']:
            is_peptidoform_in_pool = 'Y'

        if not keep:
            return None

        usi = f"mzspec:PXD007058:{msrun_name}:scan:{psm['start_scan']}:{peptidoform}/{charge}"
        return [
            str(psm['start_scan']), pool,
            str(peptideprophet_probability),
            str(iprophet_probability),
            str(charge),
            str(mean_best_probability), is_sequence_in_dataset,
            is_sequence_in_pool, is_peptidoform_in_pool,
            same_number_of_phosmods, has_alanine_phospho, sequence,
            reference_peptidoform, peptidoform, peptide_str, usi
        ]

    @staticmethod
    def _build_peptidoforms(sequence, modifications):
        """Annotate ``sequence`` with the labels of its modifications.

        Modifications are recognised by their ``mass`` value, compared with a
        tolerance of 0.01; entries without a ``mass`` key are ignored. An
        unrecognised mass is reported on stdout and otherwise skipped.

        :return: tuple ``(peptidoform, phospho_peptidoform, n_phosphos,
            has_alanine_phospho)``; the first carries every label, the second
            only ``[Phospho]`` labels, and the last is ``'Y'`` or ``'N'``
        """
        tolerance = 0.01
        #### Masses labelled [Phospho]; the last one also sets the alanine flag
        alanine_phospho_mass = 151.003445
        phospho_masses = (181.01401, 243.0297, 166.998359,
                          alanine_phospho_mass)
        #### Ordered mass -> label table; None means "recognised, no label"
        residue_labels = (
            (160.030649, None),
            (181.01401, '[Phospho]'),
            (243.0297, '[Phospho]'),
            (166.998359, '[Phospho]'),
            (151.003445, '[Phospho]'),
            (147.0354, '[Hydroxylation]'),
            (202.074213, '[Hydroxylation]'),
            (113.047664, '[Hydroxylation]'),
            (115.026943, '[Deamidation]'),
            (129.042594, '[Deamidation]'),
            (111.032029, '[Pyro-glu]'),
            (143.0041, '[Pyro_glu]'),
        )
        #### This mass becomes an N-terminal prefix, not a residue label
        acetyl_mass = 43.018425

        residues = list(sequence)
        phospho_residues = list(sequence)
        nterm = ''
        n_phosphos = 0
        has_alanine_phospho = 'N'

        for modification in modifications:
            offset = modification['position']
            if 'mass' not in modification:
                continue
            mass = modification['mass']

            #### Phospho-only peptidoforms
            if any(abs(mass - ref) < tolerance for ref in phospho_masses):
                phospho_residues[offset - 1] += '[Phospho]'
                n_phosphos += 1
                if abs(mass - alanine_phospho_mass) < tolerance:
                    has_alanine_phospho = 'Y'

            #### All-mod peptidoforms
            for ref_mass, label in residue_labels:
                if abs(mass - ref_mass) < tolerance:
                    if label is not None:
                        residues[offset - 1] += label
                    break
            else:
                if abs(mass - acetyl_mass) < tolerance:
                    nterm = '[Acetyl]-'
                else:
                    print(f"ERROR: Unable to translate {modification}")

        return (nterm + ''.join(residues), ''.join(phospho_residues),
                n_phosphos, has_alanine_phospho)
        print(f"INFO: Processed {stats['n_psms']/(t1-t0)} psms per second")
Esempio n. 10
0
"""
from __future__ import print_function

import sys
import os.path
#from msproteomicstoolslib.format import pepXMLReader
import csv
csv.field_size_limit(sys.maxsize)

from pyteomics import pepxml

infile  = sys.argv[1]

outfile = os.path.splitext(infile)[0] + '.csv'

reader = pepxml.read(infile)

writer = csv.writer(open(outfile, 'w'), delimiter='\t')

## MYRIMATCH
{
    'end_scan': 1380,
    'retention_time_sec': 5190.16999999998,
    'index': 160,
    'assumed_charge': 2,
    'spectrum': '5P_HDMSE_121214_20.1380.1380.2',
    'search_hit': [
        {
            'hit_rank': 1,
            'calc_neutral_pep_mass': 3123.6467143833,
            'modifications': [],
Esempio n. 11
0
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True


from pyteomics import pepxml

infile  = sys.argv[1]

outfile = os.path.splitext(infile)[0] + '.csv'

reader = pepxml.read(infile)

writer = csv.writer(open(outfile, 'w'), delimiter='\t')

## MYRIMATCH
{
    'end_scan': 1380,
    'retention_time_sec': 5190.16999999998,
    'index': 160,
    'assumed_charge': 2,
    'spectrum': '5P_HDMSE_121214_20.1380.1380.2',
    'search_hit': [
        {
            'hit_rank': 1,
            'calc_neutral_pep_mass': 3123.6467143833,
            'modifications': [],
Esempio n. 12
0
def parse_xml_based_format_to_identification_table(
        path2XML_file: str,
        path2fastaDB: str,
        decoy_prefix: str = 'DECOY',
        is_idXML: bool = False,
        fasta_reader_param: Dict[str, str] = {
            'filter_decoy': True,
            'decoy_string': 'DECOY'
        },
        remove_if_not_matched: bool = True) -> pd.DataFrame:
    """Parse either a pepXML or an idXML file to generate an identification table.

    Every (peptide, protein) pair whose protein name does not contain
    ``decoy_prefix`` is located in the protein sequence taken from the fasta
    database. For accessions containing ``|`` the second ``|``-separated field
    is used as the protein identifier. Parenthesised modification annotations
    are removed from the peptide before it is searched in the protein sequence.
    pepXML records without a ``search_hit`` entry contribute no rows.

    :param path2XML_file: The path to the input pepXML or idXML file
    :type path2XML_file: str
    :param path2fastaDB: The path to a fasta sequence database to obtain the protein sequences
    :type path2fastaDB: str
    :param decoy_prefix: the prefix of the decoy sequences, default is DECOY
    :type decoy_prefix: str, optional
    :param is_idXML: Whether or not the provided file is an idXML, defaults to False which assumes the provided file is a pepXML file
    :type is_idXML: bool, optional
    :param fasta_reader_param: A dict of parameters for controlling the behavior of the fasta reader, defaults to {'filter_decoy':True, 'decoy_string':'DECOY' }. It is only unpacked, never modified.
    :type fasta_reader_param: Dict[str,str], optional
    :param remove_if_not_matched: remove the peptide if it could not be matched to the parent protein, defaults to True
    :type remove_if_not_matched: bool, optional
    :raises IOError: if the fasta database could not be read
    :raises ValueError: if the XML file does not exist
    :raises KeyError: if a protein id found in the XML file is not defined in the sequence database
    :raises ValueError: if a peptide can not be mapped to the identified protein and remove_if_not_matched is False
    :return: the identification table with the columns peptide, protein, start_index and end_index
    :rtype: pd.DataFrame
    """
    # load the fasta files and extract the accession of the proteins
    try:
        sequence_dict: Dict[str, str] = fasta2dict(path2fastaDB,
                                                   **fasta_reader_param)
    except Exception as exp:
        raise IOError(
            f'While parsing your input fasta file: {path2fastaDB}, the following error was encountered: {exp}'
        ) from exp
    # check that the file exists:
    if not os.path.exists(path2XML_file):
        raise ValueError(f'The provided path: {path2XML_file} does not exist!')
    # allocate a list to hold peptide and protein list
    peptides: List[str] = []
    protein_acc: List[str] = []
    #  parse the XML file
    if is_idXML:
        with idxml.IDXML(path2XML_file) as reader:
            for elem in reader:
                for hit in elem['PeptideHit']:
                    for prot in hit['protein']:
                        if decoy_prefix not in prot['accession']:
                            peptides.append(hit['sequence'])
                            protein_acc.append(
                                _accession_to_protein_id(prot['accession']))
    else:
        with pepxml.read(path2XML_file) as reader:
            for elem in reader:
                # a record without any hit has nothing to contribute
                for hit in elem.get('search_hit', ()):
                    for protein in hit['proteins']:
                        if decoy_prefix not in protein['protein']:
                            peptides.append(hit['peptide'])
                            protein_acc.append(
                                _accession_to_protein_id(protein['protein']))
    # extract the start and end index of the peptides from the parent proteins
    start_index: List[int] = []
    end_index: List[int] = []
    for peptide, accession in zip(peptides, protein_acc):
        # get the protein sequence
        try:
            prot_seq: str = sequence_dict[accession]
        except KeyError as exp:
            raise KeyError(
                f'Database mismatch, the current protein accession: {accession} is not defined in the provided sequence database'
            ) from exp
        # the length is taken before the search so it is always defined,
        # also for peptides that turn out not to match
        bare_peptide = _strip_parenthesised_modifications(peptide)
        peptide_len = len(bare_peptide)
        # str.find returns -1 for "not found", which doubles as the
        # placeholder value that is dropped below
        start = prot_seq.find(bare_peptide)
        if start == -1 and not remove_if_not_matched:
            raise ValueError(
                f'Peptide sequence: {peptide} could not be extracted from protein sequence: {prot_seq} with accession: {accession}'
            )
        start_index.append(start)
        end_index.append(start + peptide_len)
    # build the data frame
    ident_table: pd.DataFrame = pd.DataFrame({
        'peptide': peptides,
        'protein': protein_acc,
        'start_index': start_index,
        'end_index': end_index
    })
    # filter the non-matched peptides
    ident_table = ident_table.loc[ident_table['start_index'] != -1, :]
    # return the results
    return ident_table


def _accession_to_protein_id(accession: str) -> str:
    """Return the second '|'-separated field of *accession*, or *accession* itself if it has no '|'."""
    if '|' in accession:
        return accession.split('|')[1]
    return accession


def _strip_parenthesised_modifications(peptide: str) -> str:
    """Remove every '(...)' annotation from *peptide*; an unclosed '(' drops the rest of the string."""
    # every pass removes one '(' so the loop terminates on any input
    while '(' in peptide:
        pre_seq, _, rest = peptide.partition('(')
        post_seq = rest.partition(')')[2]
        peptide = pre_seq + post_seq
    return peptide
Esempio n. 13
0
def _fractionToPercent(value):
    """Convert a 0..1 score (number or numeric string) to a whole percent.

    The product is rounded rather than truncated: ``100.0 * 0.57`` evaluates
    to ``56.99999999999999`` in binary floating point, so plain ``int()``
    would report 56 instead of 57.
    """
    return int(round(100.0 * float(value)))


def convertPepxmlToCSV(pepXMLFilepath,
                       csvOutputFilepath,
                       fractionMapping=None):
    """Write one CSV row per spectrum query of a pepXML file.

    Only the first entry of each query's ``search_hit`` list is reported.
    The hit must provide the ``PeaksDenovoScore`` and ``positional_conf``
    search scores.

    :param pepXMLFilepath: path of the pepXML file to read
    :param csvOutputFilepath: path of the CSV file to create (opened in
        write mode, so an existing file is overwritten)
    :param fractionMapping: optional iterable of ``(fraction, name)`` pairs;
        when a pair's second item equals the query's ``spectrum`` value, its
        first item (converted to ``int``) is written to the ``Fraction``
        column. The last matching pair wins; 0 is written when nothing
        matches or no mapping is given.
    :return: None
    :raises KeyError: if a query or hit lacks one of the required fields
    """
    # Local import keeps this block self-contained.
    import csv

    protonMass = 1.007276466771
    columnNames = [
        'Fraction', 'Scan', 'Source File', 'Peptide', 'Tag Length', 'ALC (%)',
        'length', 'm/z', 'z', 'RT', 'Area', 'Mass', 'ppm', 'PTM',
        'local confidence (%)', 'tag (>=0%)', 'mode'
    ]
    with open(csvOutputFilepath, "w") as fout:
        # csv.writer quotes any field containing a comma or a quote, so such a
        # value can no longer shift the columns. With '\n' as terminator the
        # output is otherwise identical to plain ','.join(...) + '\n'.
        writer = csv.writer(fout, lineterminator='\n')
        with pepxml.read(pepXMLFilepath) as reader:
            writer.writerow(columnNames)

            for currSpectrum in reader:
                fileName = currSpectrum['spectrum']
                fractionNum = 0
                if fractionMapping is not None:
                    # No break on purpose: the last matching pair wins.
                    for fractionFilePair in fractionMapping:
                        if fractionFilePair[1] == fileName:
                            fractionNum = int(fractionFilePair[0])

                # Only the first search hit of the query is exported.
                topHit = currSpectrum['search_hit'][0]
                searchScore = topHit['search_score']
                scan = currSpectrum['start_scan']

                # Prefer the modified form of the peptide when it differs.
                peptide = topHit['peptide']
                if topHit['modified_peptide'] != peptide:
                    peptide = topHit['modified_peptide']

                # Remove anything between parentheses to get rid of the
                # modifications, e.g. LLEGEEC(+57.02)R --> LLEGEECR
                strippedPeptide = re.sub(r'\([^)]*\)', '', peptide)
                tagLength = len(strippedPeptide)

                deNovoScore = _fractionToPercent(
                    searchScore['PeaksDenovoScore'])
                z = int(currSpectrum['assumed_charge'])
                precursorMass = float(currSpectrum['precursor_neutral_mass'])
                calcMass = float(topHit['calc_neutral_pep_mass'])
                # m/z of the z-fold protonated ion: (M + z * proton) / z
                mOverZ = (precursorMass / float(z)) + protonMass
                ppm = topHit['massdiff']
                # RT is a bogus constant: it is not taken from the pepXML.
                RT = '0'

                # "0.96,0.98,0.99" in pepXML becomes "96 98 99" in the CSV.
                localConfidenceString = ' '.join(
                    str(_fractionToPercent(currConfidenceDecimalScore))
                    for currConfidenceDecimalScore in
                    searchScore['positional_conf'].split(','))

                writer.writerow([
                    str(fractionNum),  # Fraction
                    str(scan),  # Scan
                    fileName,  # Source File
                    peptide,  # Peptide
                    str(tagLength),  # Tag Length
                    str(deNovoScore),  # ALC (%)
                    str(tagLength),  # length
                    str(mOverZ),  # m/z
                    str(z),  # z
                    RT,  # RT
                    '',  # Area is blank for now
                    str(calcMass),  # Mass
                    str(ppm),  # ppm
                    '',  # PTM name is blank for now
                    localConfidenceString,  # local confidence (%)
                    peptide,  # tag (>=0%)
                    '',  # mode is blank for now
                ])
Esempio n. 14
0
    `types` and of charges from 1 to `maxharge`.
    """
    for i in xrange(1, len(peptide) - 1):
        for ion_type in types:
            for charge in xrange(1, maxcharge + 1):
                if ion_type[0] in 'abc':
                    yield mass.fast_mass(peptide[:i],
                                         ion_type=ion_type,
                                         charge=charge)
                else:
                    yield mass.fast_mass(peptide[i:],
                                         ion_type=ion_type,
                                         charge=charge)


# Pull the first spectrum and the first PSM out of the example files; both
# readers are closed again as soon as the two records have been fetched.
with mgf.read('example.mgf') as spectra:
    with pepxml.read('example.pep.xml') as psms:
        spectrum = next(spectra)
        psm = next(psms)

# Experimental spectrum: one bar per peak.
pylab.figure()
top_hit_peptide = psm['search_hit'][0]['peptide']
pylab.title('Theoretical and experimental spectra for ' + top_hit_peptide)
pylab.xlabel('m/z, Th')
pylab.ylabel('Intensity, rel. units')
bar_style = dict(width=0.1, linewidth=2, edgecolor='black')
pylab.bar(spectrum['m/z array'], spectrum['intensity array'], **bar_style)

# Theoretical fragment m/z values of the top hit, for every charge state up to
# the precursor's assumed charge.
theor_spectrum = list(
    fragments(top_hit_peptide, maxcharge=psm['assumed_charge']))
Esempio n. 15
0
    def get_from_pepxmlfile(self, pepxmlfile, min_charge=1, max_charge=0, allowed_peptides=False, prefix='DECOY_', FDR_type=None, termini=set([2,1,0])):
        """Load the first-ranked PSMs of a pepXML file into this container.

        For every spectrum query only ``search_hit[0]`` is inspected. A PSM is
        added through ``self.add_elem`` when it passes the charge, termini,
        whitelist and amino-acid filters, its sequence has at least one entry
        in ``self.proteins_dict``, and no position carries more than one
        modification.

        :param pepxmlfile: pepXML file to read
        :param min_charge: lowest accepted ``assumed_charge``
        :param max_charge: highest accepted ``assumed_charge``; a falsy value
            (the default 0) disables the upper bound
        :param allowed_peptides: optional path of a text file with one peptide
            per line; when given, only those sequences are kept
        :param prefix: a PSM is marked ``'decoy'`` only if every protein name
            of the hit starts with this prefix, otherwise ``'target'``
        :param FDR_type: when it starts with ``'peptide'``, only PSMs carrying
            the lowest e-value seen for their sequence are kept; ``None`` or
            any other string disables this filtering
        :param termini: accepted ``num_tol_term`` values (taken from the first
            protein of the hit)
        :return: 0 if the search parameters could not be read, otherwise None

        Relies on the module-level ``infiles_dict`` to number input files.
        """
        if allowed_peptides:
            # Context manager closes the whitelist file deterministically.
            with open(allowed_peptides) as allowed_file:
                allowed_peptides_set = set(line.strip() for line in allowed_file)

        try:
            pepxml_params = {k: v for d in pepxml.iterfind(pepxmlfile, 'parameter name', read_schema=False) for k, v in d.items()}
            self.total_number_of_peptides_in_searchspace = int(pepxml_params.get('modelling, total peptides used', self.total_number_of_peptides_in_searchspace))
            self.total_number_of_proteins_in_searchspace = int(pepxml_params.get('modelling, total proteins used', self.total_number_of_proteins_in_searchspace))
            self.total_number_of_spectra = int(pepxml_params.get('modelling, total spectra used', self.total_number_of_spectra))
        except Exception as e:
            logger.critical('Error reading pepXML file: %s, %s ', e, e.args)
            return 0

        # FDR_type defaults to None; guard so the default does not end in an
        # AttributeError from None.startswith().
        peptide_level = bool(FDR_type) and FDR_type.startswith('peptide')

        best_scores = {}
        standard_aminoacids = set(k for k in mass.std_aa_comp if '-' not in k)
        first_psm = True
        for record in pepxml.read(pepxmlfile, read_schema=False):
            if 'search_hit' not in record:
                continue
            if not (int(min_charge) <= int(record['assumed_charge']) and (int(record['assumed_charge']) <= int(max_charge) or not max_charge)):
                continue

            hit = record['search_hit'][0]

            if first_psm:
                # One-off sanity checks on the first accepted PSM: warn about
                # fields that will silently fall back to 0 later on.
                if 'num_missed_cleavages' not in hit:
                    logger.warning('Missed cleavages are missing in pepXML file, using 0 for all peptides')
                try:
                    float(record['retention_time_sec'])
                except Exception:
                    try:
                        float(record['spectrum'].split(',')[2].split()[0])
                    except Exception:
                        logger.warning('RT experimental is missing in pepXML file, using 0 value for all peptides')
                first_psm = False

            if 'peptide' not in hit:
                continue
            sequence = hit['peptide']
            proteins = hit['proteins']
            num_tol_term_tmp = proteins[0]['num_tol_term']
            if num_tol_term_tmp not in termini:
                continue
            if allowed_peptides and sequence not in allowed_peptides_set:
                continue

            score = hit['search_score']
            try:
                evalue = score['expect']
            except Exception:
                # No 'expect' score: fall back to the inverse ion score. A
                # missing or zero 'ionscore' propagates to the caller.
                evalue = 1.0 / float(score['ionscore'])

            tags = {k: float(v) for k, v in score.items() if k.startswith('tmt')}

            # Peptide-level mode: skip PSMs worse than the best one seen so far.
            if peptide_level and best_scores.get(sequence, 1e6) < evalue:
                continue
            # Skip sequences containing non-standard residues.
            if set(sequence).difference(standard_aminoacids):
                continue

            if peptide_level:
                best_scores[sequence] = evalue
            mc = hit.get('num_missed_cleavages', 0)
            modifications = hit['modifications']
            try:
                sumI = 10 ** float(score['sumI'])
            except Exception:
                sumI = 0
            try:
                frag_mt = float(score['fragmentMT'])
            except Exception:
                frag_mt = None
            spectrum = record['spectrum']
            pcharge = record['assumed_charge']
            mass_exp = record['precursor_neutral_mass']

            if pepxmlfile not in infiles_dict:
                infiles_dict[pepxmlfile] = len(infiles_dict)
            infile_current = infiles_dict[pepxmlfile]
            pept = Peptide(sequence=sequence, settings=self.settings, evalue=evalue, pcharge=pcharge, mass_exp=mass_exp, modifications=modifications, modification_list=self.modification_list, custom_aa_mass=self.aa_list, sumI=sumI, mc=mc, infile=infile_current, frag_mt=frag_mt, tags=tags)

            # Retention time: the dedicated field divided by 60, else the
            # third comma-separated part of the spectrum title as is, else 0.
            try:
                RT_exp = float(record['retention_time_sec']) / 60
            except Exception:
                try:
                    RT_exp = float(spectrum.split(',')[2].split()[0])
                except Exception:
                    RT_exp = 0

            if all(protein['protein'].startswith(prefix) for protein in proteins):
                pept.note = 'decoy'
            else:
                pept.note = 'target'
            pept.num_tol_term = num_tol_term_tmp
            pept.next_aa = proteins[0]['peptide_next_aa']
            pept.prev_aa = proteins[0]['peptide_prev_aa']

            # NOTE(review): indexing a key that was just found missing only
            # works if proteins_dict creates entries on access (e.g. a
            # defaultdict(list)) -- confirm against the class definition.
            if pept.sequence not in self.proteins_dict:
                for prot in proteins:
                    prot_name = get_dbname(prot)
                    if prot_name not in [protein.dbname for protein in self.proteins_dict[pept.sequence]]:
                        self.proteins_dict[pept.sequence].append(Protein(dbname=prot_name, description=prot.get('protein_descr', None)))

            # Keep the PSM only if it maps to a protein and no position carries
            # more than one modification.
            if len(self.proteins_dict[pept.sequence]) and (not modifications or Counter(v['position'] for v in modifications).most_common(1)[0][1] <= 1):
                self.add_elem((pept, spectrum, RT_exp))

        self.spectrumlist = np.array(self.spectrumlist)

        if peptide_level:
            # Second pass: PSMs stored before a better one for the same
            # sequence was seen are dropped now (indices in descending order).
            js = [
                j for j in range(len(self.peptideslist) - 1, -1, -1)
                if self.peptideslist[j].evalue > best_scores.get(self.peptideslist[j].sequence, 1e6)
            ]
            self.rem_elements(js)
Esempio n. 16
0
# converter of protxml files to csv INCLUDING details from StPeter
from pyteomics import protxml, pepxml
from math import nan
import pandas as pd

# Open readers over the two hard-coded input files: the protXML file and the
# pepXML file.
pp = protxml.read("StPeterOut.prot.xml")
pepp = pepxml.read("Sample.pep.xml")
# Build tabular views of both inputs with pyteomics' DataFrame helpers.
# NOTE(review): presumably these return pandas DataFrames -- confirm against
# the pyteomics documentation.
pdata = protxml.DataFrame(pp)
pepdata = pepxml.DataFrame(pepp)
# Placeholder column initialised to NaN; extract_stpeter() below assigns SI
# values into it for rows whose "modified_peptide" matches.
pepdata["Quantification_SI"] = nan


def extract_stpeter(analysis):
    if isinstance(analysis, list):
        a = analysis[0]
        if (len(a) == 2):
            b = a.get("StPeterQuant")
            SI = b.get("SI")
            SIn = b.get("SIn")
            peps = b.get("StPeterQuant_peptide")
            peptable = pd.DataFrame(peps)
            pepseqs = peptable.get("sequence").to_string()
            # adding SI values to peptide table
            for index, r in peptable.iterrows():
                pepdata.at[pepdata["modified_peptide"] == r.get("sequence"),
                           "Quantification_SI"] = r.get("SI")

            pepSI = peptable.get("SI").to_string()
            return ({
                "analysis": a.get("analysis"),
                "SI": SI,