Example #1
    def build_db(self):
        """TODO(nh2tran): docstring."""

        print("".join(["="] * 80))  # section-separating line
        print("WorkerDB.build_db()")

        # parse the input fasta file into a list of sequences
        # more about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord
        with open(self.db_fasta_file, "r") as handle:
            record_iterator = SeqIO.parse(handle, "fasta")
            record_list = list(record_iterator)
            sequence_list = [str(record.seq) for record in record_list]
            print("Number of protein sequences: {0:d}".format(
                len(sequence_list)))

        # cleave protein sequences into a list of unique peptides
        # more about pyteomics.parser.cleave and cleavage rules:
        # https://pythonhosted.org/pyteomics/api/parser.html
        peptide_set = set()
        for sequence in sequence_list:
            peptide_set.update(
                (parser.cleave(sequence=sequence,
                               rule=parser.expasy_rules[self.cleavage_rule],
                               missed_cleavages=self.num_missed_cleavage)))
        peptide_list = list(peptide_set)

        # skip peptides containing the undetermined amino acids 'X' or 'B'
        peptide_list = [
            list(peptide) for peptide in peptide_list
            if not ('X' in peptide or 'B' in peptide)
        ]
        peptide_count = len(peptide_list)
        print("Number of peptides: {0:d}".format(peptide_count))

        # replace "L" by "I"
        for index, peptide in enumerate(peptide_list):
            peptide = ['I' if x == 'L' else x for x in peptide]
            peptide_list[index] = peptide

        # update fixed modifications
        for index, peptide in enumerate(peptide_list):
            peptide = [
                x + 'mod' if x in self.fixed_mod_list else x for x in peptide
            ]
            peptide_list[index] = peptide

        # for each peptide, find the mass and the max modification mass
        peptide_mass_array = np.zeros(peptide_count)
        pepmod_maxmass_array = np.zeros(peptide_count)
        for index, peptide in enumerate(peptide_list):
            peptide_mass_array[index] = self._compute_peptide_mass(peptide)
            pepmod = [
                x + 'mod' if x in self.var_mod_list else x for x in peptide
            ]
            pepmod_maxmass_array[index] = self._compute_peptide_mass(pepmod)

        self.peptide_count = peptide_count
        self.peptide_list = peptide_list
        self.peptide_mass_array = peptide_mass_array
        self.pepmod_maxmass_array = pepmod_maxmass_array
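build_db relies on self._compute_peptide_mass, which is not part of this excerpt. Below is a minimal sketch of such a helper, written as a plain function and assuming monoisotopic residue masses from pyteomics; the specific 'mod' masses (carbamidomethyl-Cys, oxidized Met) are assumptions rather than values taken from the snippet.

# Hypothetical sketch; the real _compute_peptide_mass is not shown above.
from pyteomics import mass

MOD_AA_MASS = {
    'Cmod': mass.std_aa_mass['C'] + 57.021464,  # carbamidomethyl-Cys (assumed)
    'Mmod': mass.std_aa_mass['M'] + 15.994915,  # oxidized Met (assumed)
}

def compute_peptide_mass(peptide):
    """Neutral monoisotopic mass of a tokenized peptide (a list of residue strings)."""
    water = mass.calculate_mass(formula='H2O')
    return water + sum(MOD_AA_MASS.get(aa, mass.std_aa_mass.get(aa, 0.0))
                       for aa in peptide)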
Example #2
def cleave_peptide(db, inp):
    # count, for each protein accession, the tryptic peptides of 7-35 residues
    # (the "observable" peptides used to normalise iBAQ-style intensities)
    dict_of_sequences = {}
    for record in SeqIO.parse(open(db), "fasta"):
        sequence = str(record.seq)
        peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        observable = [p for p in peptides if 6 < len(p) <= 35]
        dict_of_sequences[record.id.split("|")[1]] = len(observable)

    # sum the reported intensities per protein accession
    summed = {}
    for line in open(inp):
        if not line.startswith("Sequence"):
            peptide, accession, intensity = line.strip().split("\t")
            for acc in accession.split(";"):
                summed[acc] = summed.get(acc, 0.0) + float(intensity)

    # iBAQ = summed intensity / number of observable peptides
    for acc in summed:
        n_peptides = dict_of_sequences.get(acc, 0)
        ibaq = summed[acc] / n_peptides if n_peptides else 0
        try:
            print("\t".join([str(acc), str(ibaq), str(math.log10(ibaq))]))
        except ValueError:  # log10(0) is undefined
            print("\t".join([str(acc), str(ibaq), "0"]))
Example #3
  def build_db(self):
    """TODO(nh2tran): docstring."""

    print("".join(["="] * 80)) # section-separating line
    print("WorkerDB: build_db()")

    # parse the input fasta file into a list of sequences
    # more about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord
    with open(self.db_fasta_file, "r") as handle:
      record_iterator = SeqIO.parse(handle, "fasta")
      record_list = list(record_iterator)
      print("Number of protein sequences: {0:d}".format(len(record_list)))

    # cleave protein sequences into a list of unique peptides
    # more about pyteomics.parser.cleave and cleavage rules:
    # https://pythonhosted.org/pyteomics/api/parser.html

    # create a peptide to protein accession id map.
    peptide_2_protein_id = {}
    for record in record_list:
      protein_sequence = str(record.seq)
      protein_id = str(record.id)
      cleaved_peptide_set = parser.cleave(
        sequence=protein_sequence,
        rule=parser.expasy_rules[self.cleavage_rule],
        missed_cleavages=self.num_missed_cleavage)
      for peptide in cleaved_peptide_set:
        if any(x in peptide for x in ['X', 'B', 'U', 'Z']):
          # skip peptides with undetermined amino acid ['X', 'B', 'U', 'Z']
          continue
        if peptide not in peptide_2_protein_id:
          peptide_2_protein_id[peptide] = {protein_id}
        else:
          peptide_2_protein_id[peptide].add(protein_id)

    peptide_list = [list(peptide) for peptide in peptide_2_protein_id.keys()]
    peptide_list = [[x + 'mod' if x in self.fixed_mod_list else x for x in peptide]
                    for peptide in peptide_list]

    peptide_count = len(peptide_list)
    print("Number of peptides: {0:d}".format(peptide_count))

    # for each peptide, find the mass and the max modification mass
    peptide_mass_array = np.zeros(peptide_count)
    pepmod_maxmass_array = np.zeros(peptide_count)
    for index, peptide in enumerate(peptide_list):
      peptide_mass_array[index] = self._compute_peptide_mass(peptide)
      pepmod = [x + 'mod' if x in self.var_mod_list else x for x in peptide]
      pepmod_maxmass_array[index] = self._compute_peptide_mass(pepmod)

    self.peptide_count = peptide_count
    self.peptide_list = peptide_list
    self.peptide_mass_array = peptide_mass_array
    self.pepmod_maxmass_array = pepmod_maxmass_array
    self.peptide_2_protein_id = peptide_2_protein_id
Example #4
    def print_target_decoy_composition(self):
        """
    Print the number of target peptides vs decoy peptides in a Fasta database
    :return:
    """
        target_aa_composition = {i: 0 for i in [aa for aa in PYGPATK_ALPHABET]}
        decoy_aa_composition = {i: 0 for i in [aa for aa in PYGPATK_ALPHABET]}
        target_sequence = ''
        decoy_sequence = ''

        fasta = SeqIO.parse(self._output_file, 'fasta')
        target_peptides = {}
        decoy_peptides = {}
        pep_count_in_both = 0
        for record in fasta:
            peptides = cleave(
                sequence=str(record.seq),
                rule=PYGPATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'],
                missed_cleavages=self._max_missed_cleavages,
                min_length=self._min_peptide_length)
            if self._decoy_prefix in record.id:
                decoy_sequence = decoy_sequence + str(record.seq)
            else:
                target_sequence = target_sequence + str(record.seq)
            for peptide in peptides:
                if self._decoy_prefix in record.id:
                    decoy_peptides[peptide] = 'decoy'
                    if peptide in target_peptides:
                        target_peptides.pop(peptide)
                        pep_count_in_both += 1
                else:
                    if peptide not in decoy_peptides:
                        target_peptides[peptide] = 'target'

        print('Number of target peptides: {} and Decoy Peptides: {}'.format(
            len(target_peptides), len(decoy_peptides)))
        target_percentage = (
            len(target_peptides) /
            (len(target_peptides) + len(decoy_peptides))) * 100
        print('% Target peptides {:.1f}'.format(target_percentage))
        decoy_percentage = (len(decoy_peptides) /
                            (len(target_peptides) + len(decoy_peptides))) * 100
        print('% Decoy peptides {:.1f}'.format(decoy_percentage))
        duplicate_percentage = (
            pep_count_in_both /
            (len(target_peptides) + len(decoy_peptides))) * 100
        print('Number of peptides in Target and Decoy {}, Percentage {:.1f}'.
              format(pep_count_in_both, duplicate_percentage))
        target_aa_composition = self.count_aa_in_dictionary(
            target_aa_composition, target_sequence)
        decoy_aa_composition = self.count_aa_in_dictionary(
            decoy_aa_composition, decoy_sequence)
        self.print_aa_composition_rate(target_aa_composition,
                                       decoy_aa_composition)
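The method above relies on count_aa_in_dictionary, which is not shown in this excerpt. A minimal sketch of such a helper follows, written here as a plain function and inferred only from how it is called; treat it as an assumption rather than the actual implementation.

# Hypothetical sketch of the counting helper used above.
def count_aa_in_dictionary(aa_composition, sequence):
    """Increment the per-residue counters for every residue found in `sequence`."""
    for aa in sequence:
        if aa in aa_composition:
            aa_composition[aa] += 1
    return aa_composition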
Example #5
def digestProteinFromFASTA():
    sequenceIter = fasta.read(source=options.fasta)
    uniquePeptides = set()
    for s in sequenceIter:
        newPeptides = parser.cleave(s.sequence,
                                    'trypsin',
                                    missed_cleavages=options.missed,
                                    min_length=options.minLength)
        uniquePeptides.update(newPeptides)

    return [Peptide(x) for x in uniquePeptides]
Example #6
def digest(prot):
    # structured array dtype: (peptide sequence, precursor mass);
    # np.str_ replaces np.unicode_, which was removed in NumPy 2.0
    dtype = [('seq', np.str_, MAX_LENGTH_OF_PEP), ('pcMass', float)]

    peps = list(
        parser.cleave(prot,
                      parser.expasy_rules["trypsin"],
                      missed_cleavages=MISSED_CLEAVAGES,
                      min_length=MIN_LENGTH_OF_PEP))

    # mark the protein N-terminal peptide (lowercase first residue) and
    # generate singly oxidized-Met variants (M -> m)
    nTermPeps = []
    oxMPeps = []
    for pep in peps:
        if prot.index(pep) == 0:
            nTermPep = pep[0].lower() + pep[1:]
            nTermPeps.append(nTermPep)
        elif 'M' in pep:
            indices = [m.start() for m in re.finditer('M', pep)]
            for i in indices:
                oxMPep = pep[0:i] + 'm' + pep[i + 1:]
                oxMPeps.append(oxMPep)

    peps.extend(nTermPeps)
    peps.extend(oxMPeps)

    # drop over-long peptides and those with ambiguous or non-standard residues
    peps = [
        pep for pep in peps
        if (len(pep) <= MAX_LENGTH_OF_PEP and 'B' not in pep and 'J' not in pep
            and 'X' not in pep and 'Z' not in pep and 'O' not in pep
            and 'U' not in pep)
    ]

    pepsWithMass = np.array([(pep, calcuSeqMass(pep)) for pep in peps], dtype)
    return pepsWithMass
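calcuSeqMass is not defined in this snippet. A rough stand-in is sketched below, assuming monoisotopic residue masses from pyteomics, with 'm' scored as oxidized Met; scoring the lowercased N-terminal marker as the plain residue is also an assumption.

# Hypothetical stand-in for calcuSeqMass (not part of the snippet above).
from pyteomics import mass

def calcuSeqMass(pep):
    total = mass.calculate_mass(formula='H2O')  # add water for the intact peptide
    for aa in pep:
        if aa == 'm':
            total += mass.std_aa_mass['M'] + 15.994915  # Met oxidation (assumed)
        else:
            # lowercase N-terminal markers are scored as the unmodified residue
            total += mass.std_aa_mass[aa.upper()]
    return total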
Example #7
    def test_cleave_semi(self):
        self.assertEqual(
            parser._cleave('PEPTIDEKS',
                           parser.expasy_rules['trypsin'],
                           semi=True),
            [
                'PEPTIDEK', 'P', 'PE', 'PEP', 'PEPT', 'PEPTI', 'PEPTID',
                'EPTIDEK', 'PTIDEK', 'TIDEK', 'IDEK', 'DEK', 'EK', 'K', 'S'
            ])
        self.assertEqual(
            parser.cleave('PEPTIDEKS',
                          parser.expasy_rules['trypsin'],
                          semi=True),
            {
                'PEPTIDEK', 'P', 'PE', 'PEP', 'PEPT', 'PEPTI', 'PEPTID',
                'EPTIDEK', 'PTIDEK', 'TIDEK', 'IDEK', 'DEK', 'EK', 'K', 'S'
            })
Example #8
def digest(prot):

    peps = list(
        parser.cleave(prot,
                      parser.expasy_rules["trypsin"],
                      missed_cleavages=MISSED_CLEAVAGES,
                      min_length=MIN_LENGTH_OF_PEP))

    peps = [
        pep for pep in peps
        if (len(pep) <= MAX_LENGTH_OF_PEP and 'B' not in pep and 'J' not in pep
            and 'X' not in pep and 'Z' not in pep and 'O' not in pep
            and 'U' not in pep)
    ]
    dropMPeps = []
    variableModPeps = []
    anyNtermQGlu2PyrogluPeps = []
    protNtermAcetylPeps = []

    # drop the initiator Met from the protein N-terminal peptide
    for pep in peps:
        if pep[0] == 'M' and prot.find(pep) == 0:
            dropMPeps.append(pep[1:])
    peps.extend(dropMPeps)

    # protein N-terminal acetylation variants (encoded with a '5' prefix)
    for pep in peps:
        if prot.find(pep) == 0:
            protNtermAcetylPeps.append('5' + pep)
    for pep in dropMPeps:
        protNtermAcetylPeps.append('5' + pep)
    # peps.extend(protNtermAcetylPeps)

    # peptide N-terminal Gln converted to pyroglutamate (encoded with a 'q' prefix)
    for pep in peps:
        if pep[0] == 'Q' and prot.find(pep) != 0:
            anyNtermQGlu2PyrogluPeps.append('q' + pep[1:])
    peps.extend(anyNtermQGlu2PyrogluPeps)

    # variable modification: Met oxidation (M -> m)
    for pep in peps:
        variableModPeps.extend(getVariableModPeps('M', 'm', pep))

    peps.extend(variableModPeps)

    return peps
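getVariableModPeps is not included in this snippet. Here is a minimal sketch under the assumption that it returns every variant of the peptide with one or more occurrences of the target residue replaced by its modified form (e.g. 'M' -> 'm'); the name and signature are taken from the call above, the body is an assumption.

# Hypothetical sketch of getVariableModPeps (not part of the snippet above).
from itertools import combinations

def getVariableModPeps(residue, mod_residue, pep):
    """Return all variants of `pep` with >= 1 occurrence of `residue` replaced."""
    positions = [i for i, aa in enumerate(pep) if aa == residue]
    variants = []
    for n in range(1, len(positions) + 1):
        for subset in combinations(positions, n):
            chars = list(pep)
            for i in subset:
                chars[i] = mod_residue
            variants.append(''.join(chars))
    return variants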
Example #9
def digest_proteins(fasta_file):
    protein_dfs = []  # per-protein frames, concatenated once after the loop
    for protein in tqdm(SeqIO.parse(fasta_file, "fasta")):

        protein_sequence = str(protein.seq).upper()

        # remove the initiator methionine if present
        if protein_sequence.startswith("M"):
            protein_sequence = protein_sequence[1:]

        # digest the protein with Glu-C-style cleavage (rule r'[DE]');
        # for Arg-C cleavage use parser.expasy_rules['arg-c'] instead:
        # peptides = list(parser.cleave(protein_sequence, parser.expasy_rules['arg-c']))
        peptides = list(parser.cleave(protein_sequence, '[DE]'))

        # record 1-based residue positions so PTM site positions can later be
        # mapped onto peptides; note that str.index() only finds the first
        # occurrence of a repeated peptide
        initial_index = []
        terminal_index = []
        for pep in peptides:
            initial_aa_pos = protein_sequence.index(pep) + 1  # 0-based index -> 1-based position
            terminal_aa_pos = initial_aa_pos + len(pep)  # one past the last residue
            initial_index.append(initial_aa_pos)
            terminal_index.append(terminal_aa_pos)

        # add this protein to the dataframe
        new_df = pd.DataFrame({
            'protein': [protein.id] * len(peptides),
            'peptide_sequence': peptides,
            'initial_aa_index': initial_index,
            'terminal_aa_index': terminal_index
        })

        protein_dfs.append(new_df)

    # DataFrame.append was removed in pandas 2.0, so collect the per-protein
    # frames and concatenate them once after the loop
    protein_df = pd.concat(protein_dfs, ignore_index=True)
    protein_df = protein_df[
        protein_df['peptide_sequence'].notna()]  # drop any NaN peptides
    protein_df = protein_df[protein_df['peptide_sequence'].str.len() >=
                            4]  # drop peptides <4 aa
    protein_df = protein_df[protein_df['peptide_sequence'].str.len() <=
                            40]  # drop peptides >40 aa

    sys.stdout.write("Finished in silico digestion and generating peptides.\n")

    return protein_df
Example #10
def read_fasta_sequences(fasta_file):
    """ Read sequence records from a FASTA file. """
    sequence_records = []
    for description, sequence in fasta.read(fasta_file):
        # Initialize sequence record with sequence string.
        sequence_record = {'sequence': sequence}

        # Get sequence info.
        description_parts = description.split()
        sequence_record['id'] = description_parts[0]

        # Get the sequence's peptides.
        sequence_record['peptides'] = parser.cleave(
            sequence, 
            parser.expasy_rules['trypsin'],
            missed_cleavages=1 #max no. of missed cleavages.
        )

        # Save the sequence record, keyed by the id.
        sequence_records.append(sequence_record)

    return sequence_records
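A hypothetical usage of the function above; the FASTA path is a placeholder and the dictionary keys are the ones defined in the function.

# Hypothetical usage; 'proteins.fasta' is a placeholder path.
records = read_fasta_sequences('proteins.fasta')
for rec in records:
    print(rec['id'], len(rec['peptides']))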
Example #11
def prot_to_peprec(protein):
    params = get_params()
    rows = []
    pep_count = 0
    for peptide in cleave(str(protein.seq), expasy_rules['trypsin'],
                          params['missed_cleavages']):
        # skip peptides containing ambiguous or non-standard residues
        if all(aa not in peptide for aa in ['B', 'J', 'O', 'U', 'X', 'Z']):
            # 186 is roughly the residue mass of Trp, so together with max_pepmass
            # this caps the peptide length before the exact mass check below
            if params['min_peplen'] <= len(peptide) < int(
                    params['max_pepmass'] / 186 + 2):
                if mass.calculate_mass(sequence=peptide) <= params['max_pepmass']:
                    pep_count += 1
                    rows.append({
                        'spec_id': '{}_{:03d}'.format(protein.id, pep_count),
                        'peptide': peptide,
                        'modifications': '-',
                        'charge': np.nan
                    })
    # DataFrame.append was removed in pandas 2.0; build the frame in one go instead
    return pd.DataFrame(rows,
                        columns=['spec_id', 'peptide', 'modifications', 'charge'])
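get_params is not part of this snippet. A minimal stand-in is sketched below, with the keys inferred from the calls above and placeholder values.

# Hypothetical stand-in for get_params; the values are placeholders.
def get_params():
    return {
        'missed_cleavages': 2,
        'min_peplen': 8,
        'max_pepmass': 5000.0,
    }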
Example #12
     geneError += 1
     gError.append(identifier)
 match4 = re.search(r'\\PE=(.+?) \\', record.description)
 if match4:
     pe = match4.group(1)
 else:
     pe = ''
     print(f'WARNING: Unable to find PE value in {record.description}')
 match5 = re.search(r'\((.{0,5})\|(.{0,5})\|PEFF:\d+\|mature protein\)',
                    record.description)
 x = len(
     re.findall(r'\((.{0,5})\|(.{0,5})\|PEFF:\d+\|mature protein\)',
                record.description))
 if x > 1:
     mature = fixed_sequence
     peptides = parser.cleave(mature, 'trypsin')
     for peptide in peptides:
         if 9 <= len(peptide) <= 40:
             trypnum += 1
 if match5:
     if match5.group(1) == '?' or match5.group(2) == '?':
         matureUnknown += 1
         mUnknown.append(identifier)
         mature = fixed_sequence
         peptides = parser.cleave(mature, 'trypsin')
         for peptide in peptides:
             if 9 <= len(peptide) <= 40:
                 trypnum += 1
     else:
         start = int(match5.group(1))
         end = int(match5.group(2))
Example #13
"""
Created on Fri Mar 22 11:28:43 2013

@author: ilya
"""

from pyteomics import fasta, mgf, parser
import pylab

fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta'
mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf'

peptides = set()
with open(fasta_file) as fi:
    for description, sequence in fasta.read(fi):
        new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        peptides.update(new_peptides)
        
print "UNIQUE PEPTIDES"
print peptides

with open(mgf_file) as fi:
    for spectrum in mgf.read(fi):
        pylab.figure()
        pylab.xlabel('m/z, Th')
        pylab.ylabel('Intensity, rel.units')
        pylab.bar(spectrum['m/z array'], spectrum['intensity array'], width=0.1, linewidth=2, edgecolor='black')
        pylab.show()
        inp = input("Show more?")
        if inp != "yes":
            break
Example #14
    def pypgatk_decoy_database(self):
        """
    Create a decoy database from a proteomics database
    target db is digested and only digested peptides > _peptide_length are kept
    next, each target protein is reversed and digested, all peptides are kept
    regardless of their length.
    The list of digested peptides from the reversed protein are iterated:
     - small peptides are kept (len < _peptide_length)
     - peptides not found in target are kep
     - peptides with a match are shuffled for max_iterations, if a non-target
       peptide was found then written otherwise the peptide is skipped unless
      the _keep_target_hits option is true.
     :return:
    """

        # sets for target peptides and for decoy peptides with no usable alternative
        upeps = set()
        noAlternative = set()
        # Open FASTA file using first cmd line argument
        fasta = SeqIO.parse(self._input_fasta, 'fasta')
        # loop each seq in the file
        for record in fasta:
            seq = str(record.seq)
            if not self._isobaric:
                seq = seq.replace('I', 'L')

                # digest sequence add peptides to the target set
                upeps.update(
                    cleave(sequence=seq,
                           rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                           ['cleavage rule'],
                           missed_cleavages=self._max_missed_cleavages,
                           min_length=self._min_peptide_length))

        # open the output FASTA file (target plus decoy entries)
        with open(self._output_file, 'w') as outfa:
            fasta = SeqIO.parse(self._input_fasta, 'fasta')
            targets = []
            decoys = []
            for i, record in enumerate(fasta):
                protseq = str(record.seq)
                targets.append(protseq)
                revprotseq = []

                # output target protein
                seq = str(record.seq)
                id_protein = record.id
                description = record.description
                outfa.write('>' + id_protein + ' ' + description + '\n')
                outfa.write(seq + '\n')

                for seq in protseq.split('*'):
                    if not seq:
                        continue
                    if not self._isobaric:
                        seq = seq.replace('I', 'L')

                    # reverse and switch protein sequence
                    decoyseq = self.revswitch(
                        seq, self._no_switch, PYGPATK_ENZYMES.enzymes[
                            self._enzyme]['cleavage sites'])

                    decoy_peps = cleave(sequence=decoyseq,
                                        rule=PYGPATK_ENZYMES.enzymes[
                                            self._enzyme]['cleavage rule'],
                                        missed_cleavages=0,
                                        min_length=0)

                    # if any of the digested peptides are found in the targets (upeps) then shuffle
                    checked_decoy_peps = []
                    for decoy_pep in decoy_peps:
                        if len(decoy_pep) < self._min_peptide_length:
                            checked_decoy_peps.append(decoy_pep)
                            continue

                        found_in_target = False
                        aPep = ''
                        if decoy_pep in upeps:
                            found_in_target = True
                        else:
                            checked_decoy_peps.append(decoy_pep)
                            continue

                        if found_in_target and not self._no_suffle and decoy_pep not in noAlternative:
                            aPep = decoy_pep
                            # shuffle until aPep is not in target set (maximum of 10 iterations)
                            i = 0
                            while aPep in upeps and i < self._max_iterations:
                                # increment iteration counter
                                i += 1
                                # shuffle peptide
                                aPep = self.shuffle(aPep)

                                # check if shuffling has an effect if not end iterations
                                if aPep == decoy_pep:
                                    i = self._max_iterations

                                # warn if peptide has no suitable alternative, add to removal list
                                if i == self._max_iterations:
                                    noAlternative.add(decoy_pep)
                                    aPep = ''
                        # if decoy is generated then add to the list of peptides
                        if aPep:
                            checked_decoy_peps.append(aPep)
                        else:
                            if self._keep_target_hits:
                                checked_decoy_peps.append(decoy_pep)
                    # finally join the peptides to generate protein decoy
                    if checked_decoy_peps:
                        revprotseq.append(''.join(checked_decoy_peps))

                outfa.write('>{}\n{}\n'.format(
                    self._decoy_prefix + str(record.id) + ' ' +
                    record.description, '*'.join(revprotseq)))
                decoys.append('*'.join(revprotseq))

            with open(
                    self._output_file.replace('.fa', '') + '_noAlternative.fa',
                    'w') as noAlternative_outfa:
                noAlternative_outfa.write('\n'.join(noAlternative) + '\n')
            print(
                'Number of skipped tryptic peptides in decoy db (no alternatives): {}'
                .format(len(noAlternative)))
            print(
                'Total number of amino acids in target and decoy databases: ',
                len(''.join(targets)), len(''.join(decoys)))
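Both this method and the DecoyPyrat-style method in the following example call self.shuffle, which is not shown. A minimal sketch of what such a helper typically does (a random permutation of the peptide's residues) follows, written as a plain function; treat it as an assumption.

# Hypothetical sketch of the shuffle helper used above (not shown in the snippet).
import random

def shuffle(peptide):
    """Return a random permutation of the peptide's residues."""
    residues = list(peptide)
    random.shuffle(residues)
    return ''.join(residues)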
Example #15
    def generate_decoypyrat_database(self):
        """
    Create a decoy database from a proteomics database this method is presented in manuscript:
    J Proteomics Bioinform. 2016 Jun 27; 9(6): 176–180. PMCID: PMC4941923
    DecoyPyrat: Fast Non-redundant Hybrid Decoy Sequence Generation for Large Scale Proteomics
    :return:
    """

        # Create empty sets to add all target and decoy peptides
        upeps = set()
        dpeps = set()

        # Counter for number of decoy sequences
        dcount = 0

        # Open FASTA file using first cmd line argument
        # fasta = SeqIO.parse(self._input_fasta, 'fasta')

        with open(self._input_fasta) as handle:
            # open temporary decoy FASTA file
            with open(self._temp_file, 'w') as outfa:

                # loop each seq in the file
                for value in SimpleFastaParser(handle):
                    seq = value[1]
                    description = value[0]
                    dcount += 1
                    # make sequence isobaric (check args for switch off)
                    if not self._isobaric:
                        seq = seq.replace('I', 'L')

                    # digest sequence add peptides to set
                    upeps.update(
                        cleave(sequence=seq,
                               rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                               ['cleavage rule'],
                               missed_cleavages=0,
                               min_length=self._min_peptide_length))

                    # reverse and switch protein sequence
                    decoyseq = self.revswitch(
                        seq, self._no_switch, PYGPATK_ENZYMES.enzymes[
                            self._enzyme]['cleavage sites'])

                    # do not store decoy peptide set in reduced memory mode
                    if not self._memory_save:
                        # update decoy peptide set
                        dpeps.update(
                            cleave(sequence=decoyseq,
                                   rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                                   ['cleavage rule'],
                                   missed_cleavages=0,
                                   min_length=self._min_peptide_length))

                    # write decoy protein accession and sequence to file
                    outfa.write('>' + self._decoy_prefix + description + '\n')
                    outfa.write(decoyseq + '\n')

        # Summarise the numbers of target and decoy peptides and their intersection
        nonDecoys = set()
        print("proteins:" + str(dcount))
        print("target peptides:" + str(len(upeps)))

        # Reloop decoy file in reduced memory mode to store only intersecting decoys
        if self._memory_save:
            # open temp decoys
            with open(self._temp_file, "rt") as fin:
                for line in fin:
                    # if line is not accession
                    if line[0] != '>':
                        # digest protein
                        for p in cleave(sequence=line.rstrip(),
                                        rule=PYGPATK_ENZYMES.enzymes[
                                            self._enzyme]['cleavage rule'],
                                        missed_cleavages=0,
                                        min_length=self._min_peptide_length):
                            # check if in target peptides if true then add to nonDecoys
                            if p in upeps:
                                nonDecoys.add(p)
            print("decoy peptides: !Memory Saving Made!")
        else:
            # can only report total number in normal memory mode
            print("decoy peptides:" + str(len(dpeps)))
            # find intersecting peptides
            nonDecoys = upeps.intersection(dpeps)

        print("#intersection:" + str(len(nonDecoys)))

        # if there are decoy peptides that are in the target peptide set
        if len(nonDecoys) > 0 and not self._no_suffle:

            # create empty dictionary with bad decoys as keys
            dAlternative = dict.fromkeys(nonDecoys, '')
            noAlternative = list()

            # loop bad decoys / dictionary keys
            for dPep in dAlternative:
                i = 0
                aPep = dPep

                # shuffle until aPep is not in target set (maximum of 10 iterations)
                while aPep in upeps and i < self._max_iterations:

                    # increment iteration counter
                    i += 1

                    # shuffle peptide
                    aPep = self.shuffle(dPep)

                    # check if shuffling has an effect if not end iterations
                    if aPep == dPep:
                        i = self._max_iterations

                # update dictionary with alternative shuffled peptide
                dAlternative[dPep] = aPep

                # warn if peptide has no suitable alternative, add to removal list
                if i == self._max_iterations:
                    noAlternative.append(dPep)

            print(str(len(noAlternative)) + ' have no alternative peptide')
            # remove peptides with no alternative
            for p in noAlternative:
                del dAlternative[p]

            # Free up memory by clearing large sets of peptides
            upeps.clear()
            dpeps.clear()

            # open second decoy file
            with open(self._output_file, "wt") as fout:

                # Attach the target sequences to the database
                # fasta = SeqIO.parse(self._input_fasta, 'fasta')
                with open(self._input_fasta) as handle:
                    for value in SimpleFastaParser(handle):
                        description = value[0]
                        seq = value[1]
                        fout.write('>' + description + '\n')
                        fout.write(seq + '\n')

                # open original decoy file
                with open(self._temp_file, "rt") as fin:
                    # loop each line of original decoy fasta
                    for line in fin:
                        # if line is not accession replace peptides in dictionary with alternatives
                        if line[0] != '>':
                            # digest decoy sequence
                            for p in cleave(
                                    sequence=line.rstrip(),
                                    rule=PYGPATK_ENZYMES.enzymes[
                                        self._enzyme]['cleavage rule'],
                                    missed_cleavages=0,
                                    min_length=self._min_peptide_length):
                                # store decoy peptide for final count
                                dpeps.add(p)

                                # if decoy peptide is in dictionary replace with alternative
                                if p in dAlternative:
                                    line = line.replace(p, dAlternative[p])

                        fout.write(line)

            # delete temporary file
            os.remove(self._temp_file)
        else:
            os.rename(self._temp_file, self._output_file)

        print("final decoy peptides:" + str(len(dpeps)))
Example #16
def digest(prot):
    peps = list(parser.cleave(prot, parser.expasy_rules["trypsin"],
                              missed_cleavages=0, min_length=MIN_LENGTH_OF_PEP))
    peps = [pep for pep in peps if len(pep) <= MAX_LENGTH_OF_PEP]
    return peps
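A hypothetical usage of the function above; the pyteomics import and the module-level length constants are assumed here, and their values are placeholders.

# Assumed setup; the constant values are placeholders.
from pyteomics import parser

MIN_LENGTH_OF_PEP = 7
MAX_LENGTH_OF_PEP = 40

print(digest("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVK"))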
Example #17
arg_parser.add_argument('-v', '--verbosity', action='count',
                        help='increase output verbosity')

args = arg_parser.parse_args()

# TODO: Do it proper way - using os.path
out_file = args.fasta_file + '.peptides'                        
peptides = []

with fasta.read(args.fasta_file) as reader, open(out_file,'w') as writer:
    
    # Build a set of peptides for each fasta sequence 
    if args.verbosity >= 1:
        print('Building digests...')
    for description, sequence in reader:
        peps = parser.cleave(sequence, parser.expasy_rules[args.enz], args.missed)
        peps = [x for x in peps if len(x) > args.min]
        writer.write('Peptides for {seq} ({enz} cleavage)\n'.format(
            seq=description, enz=args.enz))
        writer.write('...\t{n} missed cleavages\n'.format(n=args.missed))
        writer.write('\n'.join(peps)+'\n')
        peptides.append(set(peps))
        if args.verbosity >= 2:
            print('...\t{n} peptides for {prot}'.format(n=len(peps), prot=description))
        
    # Identify unique peptides for each fasta sequence
    if args.verbosity >= 1:
        print('Finding unique peptides...')
    for peps in peptides:
        rest = [x for x in peptides if x is not peps]
        unique = peps - set().union(*rest)
Example #18
def digest_protein(protein=None):
    # '[KR]' cleaves after every K or R (trypsin-like, without the proline exception)
    digest = parser.cleave(protein, '[KR]', missed_cleavages=3, min_length=6)
    return digest


def trypsin(aa):
    # fully tryptic digest: up to one missed cleavage, peptides of at least 7 residues
    return parser.cleave(aa,
                         parser.expasy_rules['trypsin'],
                         min_length=7,
                         missed_cleavages=1)
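A hypothetical usage of the two helpers above; the import and the test sequence are placeholders, not part of the original snippet.

# Assumed import and an arbitrary test sequence.
from pyteomics import parser

test_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVK"
print(sorted(trypsin(test_seq)))
print(sorted(digest_protein(test_seq)))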