def build_db(self):
    """Digest the FASTA database into peptides and cache their masses.

    Every protein in ``self.db_fasta_file`` is cleaved with
    ``parser.expasy_rules[self.cleavage_rule]`` (allowing
    ``self.num_missed_cleavage`` missed cleavages) and the peptides of all
    proteins are pooled into one set. Peptides containing 'X' or 'B' are
    dropped, every 'L' is rewritten to 'I', and each residue listed in
    ``self.fixed_mod_list`` gets the suffix 'mod'.

    Sets on ``self``:
      peptide_count: number of retained peptides.
      peptide_list: one list of residue tokens per retained peptide.
      peptide_mass_array: ``self._compute_peptide_mass`` of each peptide.
      pepmod_maxmass_array: the same mass after additionally tagging every
        residue found in ``self.var_mod_list`` with 'mod'.
    """
    print("=" * 80)  # section-separating line
    print("WorkerDB.build_db()")

    # More about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord
    with open(self.db_fasta_file, "r") as fasta_handle:
        sequence_list = [
            str(record.seq) for record in SeqIO.parse(fasta_handle, "fasta")
        ]
    print("Number of protein sequences: {0:d}".format(len(sequence_list)))

    # Pool the digests of all proteins; the set keeps each peptide once.
    # More about pyteomics.parser.cleave and cleavage rules:
    # https://pythonhosted.org/pyteomics/api/parser.html
    unique_peptides = set()
    for protein_sequence in sequence_list:
        digest = parser.cleave(
            sequence=protein_sequence,
            rule=parser.expasy_rules[self.cleavage_rule],
            missed_cleavages=self.num_missed_cleavage)
        unique_peptides.update(digest)

    # Skip peptides with an undetermined amino acid ('X' or 'B').
    retained = [
        pep for pep in unique_peptides if 'X' not in pep and 'B' not in pep
    ]
    peptide_count = len(retained)
    print("Number of peptides: {0:d}".format(peptide_count))

    # Tokenise each peptide: 'L' -> 'I' first, then the fixed modifications.
    peptide_list = []
    for pep in retained:
        residues = ['I' if aa == 'L' else aa for aa in pep]
        residues = [
            aa + 'mod' if aa in self.fixed_mod_list else aa for aa in residues
        ]
        peptide_list.append(residues)

    # For each peptide: its mass, and its mass with every variable
    # modification applied (the maximum modified mass).
    peptide_mass_array = np.zeros(peptide_count)
    pepmod_maxmass_array = np.zeros(peptide_count)
    for row, residues in enumerate(peptide_list):
        peptide_mass_array[row] = self._compute_peptide_mass(residues)
        fully_modified = [
            aa + 'mod' if aa in self.var_mod_list else aa for aa in residues
        ]
        pepmod_maxmass_array[row] = self._compute_peptide_mass(fully_modified)

    self.peptide_count = peptide_count
    self.peptide_list = peptide_list
    self.peptide_mass_array = peptide_mass_array
    self.pepmod_maxmass_array = pepmod_maxmass_array
def cleave_peptide(db, inp):
    """Print a per-accession intensity table normalised by tryptic peptide count.

    Args:
        db: path of a FASTA file. The accession of a record is the second
            ``|``-separated field of its id.
        inp: path of a tab-separated table with three columns (peptide,
            accessions joined by ``;``, intensity). Lines starting with
            "Sequence" (the header) and blank lines are skipped.

    For every accession seen in *inp* one line is printed: accession, the
    summed intensity divided by the number of tryptic peptides of 7 to 35
    residues of that protein (the value the code names ``ibaq``), and its
    log10. When the protein is unknown or has no such peptide the value is 0,
    and whenever log10 is undefined the last column is the string "0".
    """
    # Number of tryptic peptides with 6 < length <= 35, per accession.
    dict_of_sequences = {}
    with open(db) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            peptides = parser.cleave(str(record.seq),
                                     parser.expasy_rules['trypsin'])
            observable = [pep for pep in peptides if 6 < len(pep) <= 35]
            dict_of_sequences[record.id.split("|")[1]] = len(observable)

    # Summed intensity per accession; a peptide shared by several accessions
    # contributes its full intensity to each of them.
    summed = {}
    with open(inp) as table:
        for line in table:
            if line.startswith("Sequence") or not line.strip():
                continue
            peptide, accession, intensity = line.strip().split("\t")
            value = float(intensity)
            for acc in accession.split(";"):
                summed[acc] = summed.get(acc, 0) + value

    for acc, total in summed.items():
        lengths = dict_of_sequences.get(acc, 0)
        ibaq = 0 if lengths == 0 else float(total) / lengths
        try:
            log_ibaq = str(math.log10(ibaq))
        except ValueError:
            # log10 is undefined for ibaq <= 0; report "0" as before.
            log_ibaq = "0"
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\t".join([str(acc), str(ibaq), log_ibaq]))
def build_db(self):
    """Digest the FASTA database and index the peptides by source protein.

    Every record of ``self.db_fasta_file`` is cleaved with
    ``parser.expasy_rules[self.cleavage_rule]`` (allowing
    ``self.num_missed_cleavage`` missed cleavages). Peptides containing any
    of 'X', 'B', 'U', 'Z' are skipped. Residues listed in
    ``self.fixed_mod_list`` get the suffix 'mod'.

    Sets on ``self``:
      peptide_count: number of distinct retained peptides.
      peptide_list: one list of residue tokens per peptide.
      peptide_mass_array: ``self._compute_peptide_mass`` of each peptide.
      pepmod_maxmass_array: the same mass after additionally tagging every
        residue found in ``self.var_mod_list`` with 'mod'.
      peptide_2_protein_id: dict mapping the peptide string to the set of
        record ids it was cleaved from.
    """
    print("=" * 80)  # section-separating line
    print("WorkerDB: build_db()")

    # More about SeqIO and SeqRecord: http://biopython.org/wiki/SeqRecord
    with open(self.db_fasta_file, "r") as fasta_handle:
        record_list = list(SeqIO.parse(fasta_handle, "fasta"))
    print("Number of protein sequences: {0:d}".format(len(record_list)))

    # Map every retained peptide to the ids of the proteins that produce it.
    # More about pyteomics.parser.cleave and cleavage rules:
    # https://pythonhosted.org/pyteomics/api/parser.html
    undetermined = ('X', 'B', 'U', 'Z')
    peptide_2_protein_id = {}
    for record in record_list:
        protein_id = str(record.id)
        digest = parser.cleave(
            sequence=str(record.seq),
            rule=parser.expasy_rules[self.cleavage_rule],
            missed_cleavages=self.num_missed_cleavage)
        for peptide in digest:
            if any(code in peptide for code in undetermined):
                continue
            peptide_2_protein_id.setdefault(peptide, set()).add(protein_id)

    # Tokenise the peptides (in dict order) and apply the fixed modifications.
    peptide_list = [
        [aa + 'mod' if aa in self.fixed_mod_list else aa for aa in peptide]
        for peptide in peptide_2_protein_id
    ]
    peptide_count = len(peptide_list)
    print("Number of peptides: {0:d}".format(peptide_count))

    # For each peptide: its mass, and its mass with every variable
    # modification applied (the maximum modified mass).
    peptide_mass_array = np.zeros(peptide_count)
    pepmod_maxmass_array = np.zeros(peptide_count)
    for row, residues in enumerate(peptide_list):
        peptide_mass_array[row] = self._compute_peptide_mass(residues)
        fully_modified = [
            aa + 'mod' if aa in self.var_mod_list else aa for aa in residues
        ]
        pepmod_maxmass_array[row] = self._compute_peptide_mass(fully_modified)

    self.peptide_count = peptide_count
    self.peptide_list = peptide_list
    self.peptide_mass_array = peptide_mass_array
    self.pepmod_maxmass_array = pepmod_maxmass_array
    self.peptide_2_protein_id = peptide_2_protein_id
def print_target_decoy_composition(self):
    """
    Print target-versus-decoy statistics for the FASTA database stored in
    ``self._output_file``.

    Each record is digested with the configured enzyme rule,
    ``self._max_missed_cleavages`` and ``self._min_peptide_length``. A record
    counts as a decoy when ``self._decoy_prefix`` occurs in its id. A peptide
    produced by a decoy record is removed from the target set (and counted
    as shared) if it was already seen there, and target records never add a
    peptide that is already a decoy peptide. The method prints the peptide
    counts, the target/decoy/shared percentages and finally hands the
    per-residue counts of both sequence pools to
    ``self.print_aa_composition_rate``.

    When no peptide is found at all, the percentages are reported as 0.0
    instead of raising ``ZeroDivisionError``.

    :return: None
    """
    target_aa_composition = {aa: 0 for aa in PYGPATK_ALPHABET}
    decoy_aa_composition = {aa: 0 for aa in PYGPATK_ALPHABET}

    # Collect sequences in lists and join once: repeated ``str +`` is
    # quadratic on a large database.
    target_chunks = []
    decoy_chunks = []
    target_peptides = {}
    decoy_peptides = {}
    pep_count_in_both = 0

    for record in SeqIO.parse(self._output_file, 'fasta'):
        sequence = str(record.seq)
        peptides = cleave(
            sequence=sequence,
            rule=PYGPATK_ENZYMES.enzymes[self._enzyme]['cleavage rule'],
            missed_cleavages=self._max_missed_cleavages,
            min_length=self._min_peptide_length)

        if self._decoy_prefix in record.id:
            decoy_chunks.append(sequence)
            for peptide in peptides:
                decoy_peptides[peptide] = 'decoy'
                if peptide in target_peptides:
                    # Shared peptide: it stops counting as a target.
                    target_peptides.pop(peptide)
                    pep_count_in_both += 1
        else:
            target_chunks.append(sequence)
            for peptide in peptides:
                if peptide not in decoy_peptides:
                    target_peptides[peptide] = 'target'

    target_count = len(target_peptides)
    decoy_count = len(decoy_peptides)
    total = target_count + decoy_count

    def percentage(count):
        # Guard the empty database: nothing to divide by.
        return (count / total) * 100 if total else 0.0

    print('Number of target peptides: {} and Decoy Peptides: {}'.format(
        target_count, decoy_count))
    print('% Target peptides {:.1f}'.format(percentage(target_count)))
    print('% Decoy peptides {:.1f}'.format(percentage(decoy_count)))
    print('Number of peptides in Target and Decoy {}, Percentage {:.1f}'.
          format(pep_count_in_both, percentage(pep_count_in_both)))

    target_aa_composition = self.count_aa_in_dictionary(
        target_aa_composition, ''.join(target_chunks))
    decoy_aa_composition = self.count_aa_in_dictionary(
        decoy_aa_composition, ''.join(decoy_chunks))
    self.print_aa_composition_rate(target_aa_composition,
                                   decoy_aa_composition)
def digetsProteinFromFASTA():
    """Digest every protein of ``options.fasta`` and return the unique peptides.

    Each sequence is cleaved with the ExPASy trypsin rule, allowing
    ``options.missed`` missed cleavages and keeping peptides of at least
    ``options.minLength`` residues. Peptides are pooled across proteins in a
    set, so each distinct sequence appears once.

    Returns:
        list: one ``Peptide`` object per distinct peptide string.
    """
    # Pass the actual regular expression: a bare 'trypsin' string is only
    # resolved to the ExPASy rule by newer pyteomics releases, while older
    # ones treat it as a literal pattern and never cleave.
    trypsin_rule = parser.expasy_rules['trypsin']

    uniquePeptides = set()
    # The reader is a context manager; ``with`` closes the file even if a
    # record fails to digest.
    with fasta.read(source=options.fasta) as sequenceIter:
        for s in sequenceIter:
            uniquePeptides.update(
                parser.cleave(s.sequence,
                              trypsin_rule,
                              missed_cleavages=options.missed,
                              min_length=options.minLength))
    return [Peptide(x) for x in uniquePeptides]
def digest(prot):
    """Digest *prot* with trypsin and return the peptides with their masses.

    The protein is cleaved with the ExPASy trypsin rule
    (``MISSED_CLEAVAGES`` missed cleavages, minimum length
    ``MIN_LENGTH_OF_PEP``). Peptides longer than ``MAX_LENGTH_OF_PEP`` or
    containing any of B, J, X, Z, O, U are discarded. For each remaining
    peptide:

    * if its first occurrence in *prot* is at position 0, a variant with a
      lower-cased first residue is added;
    * otherwise, for every 'M' it contains, a variant with that single 'M'
      replaced by 'm' is added (presumably an oxidised methionine, going by
      the original variable name; confirm against ``calcuSeqMass``).

    Returns:
        numpy.ndarray: structured array with fields ``seq`` (unicode, width
        ``MAX_LENGTH_OF_PEP``) and ``pcMass`` (float, ``calcuSeqMass(seq)``),
        ordered as base peptides, N-terminal variants, 'm' variants.
    """
    # np.str_ is the same type np.unicode_ aliased; the alias was removed in
    # NumPy 2.0.
    dtype = [('seq', np.str_, MAX_LENGTH_OF_PEP), ('pcMass', float)]

    # Filter BEFORE building variants: lower-casing the first residue would
    # otherwise let a peptide starting with an ambiguous code (e.g. 'X')
    # slip past the upper-case check as 'x...'.
    ambiguous = 'BJXZOU'
    peps = [
        pep for pep in parser.cleave(prot,
                                     parser.expasy_rules["trypsin"],
                                     missed_cleavages=MISSED_CLEAVAGES,
                                     min_length=MIN_LENGTH_OF_PEP)
        if len(pep) <= MAX_LENGTH_OF_PEP
        and not any(code in pep for code in ambiguous)
    ]

    nTermPeps = []
    oxMPeps = []
    for pep in peps:
        if prot.index(pep) == 0:
            nTermPeps.append(pep[0].lower() + pep[1:])
        elif 'M' in pep:
            # One variant per methionine position, each with a single 'm'.
            oxMPeps.extend(pep[:i] + 'm' + pep[i + 1:]
                           for i, aa in enumerate(pep) if aa == 'M')

    peps.extend(nTermPeps)
    peps.extend(oxMPeps)
    return np.array([(pep, calcuSeqMass(pep)) for pep in peps], dtype)
def test_cleave_semi(self):
    """Semi-specific tryptic digestion of 'PEPTIDEKS'.

    ``parser._cleave`` must return the peptides as a list in the order given
    below, and ``parser.cleave`` must return the same peptides as a set.
    """
    trypsin_rule = parser.expasy_rules['trypsin']
    expected = [
        'PEPTIDEK', 'P', 'PE', 'PEP', 'PEPT', 'PEPTI', 'PEPTID', 'EPTIDEK',
        'PTIDEK', 'TIDEK', 'IDEK', 'DEK', 'EK', 'K', 'S'
    ]
    ordered = parser._cleave('PEPTIDEKS', trypsin_rule, semi=True)
    self.assertEqual(ordered, expected)

    unordered = parser.cleave('PEPTIDEKS', trypsin_rule, semi=True)
    self.assertEqual(unordered, set(expected))
def digest(prot):
    """Digest *prot* with trypsin and append modified peptide forms.

    The protein is cleaved with the ExPASy trypsin rule
    (``MISSED_CLEAVAGES`` missed cleavages, minimum length
    ``MIN_LENGTH_OF_PEP``); peptides longer than ``MAX_LENGTH_OF_PEP`` or
    containing any of B, J, X, Z, O, U are discarded. The returned list is
    then extended, in this order, with:

    1. N-terminal peptides starting with 'M', without that 'M';
    2. for peptides starting with 'Q' that are not at the protein
       N-terminus, the same peptide with 'q' as first character
       (Q -> pyro-Glu);
    3. everything ``getVariableModPeps('M', 'm', pep)`` returns for each
       peptide collected so far (variable M oxidation).

    Returns:
        list of peptide strings.
    """
    forbidden = 'BJXZOU'
    peps = [
        pep for pep in parser.cleave(prot,
                                     parser.expasy_rules["trypsin"],
                                     missed_cleavages=MISSED_CLEAVAGES,
                                     min_length=MIN_LENGTH_OF_PEP)
        if len(pep) <= MAX_LENGTH_OF_PEP
        and not any(code in pep for code in forbidden)
    ]

    # Drop the initiator M of peptides located at the protein N-terminus.
    dropMPeps = [
        pep[1:] for pep in peps if pep[0] == 'M' and prot.find(pep) == 0
    ]
    peps.extend(dropMPeps)

    # Protein N-terminal acetylation, marked with a leading '5'.
    # NOTE(review): this list is built but never merged into the result
    # (the corresponding extend was commented out) - confirm that is intended.
    protNtermAcetylPeps = ['5' + pep for pep in peps if prot.find(pep) == 0]
    protNtermAcetylPeps.extend('5' + pep for pep in dropMPeps)

    # Peptide N-terminal Q -> pyro-Glu, except at the protein N-terminus.
    anyNtermQGlu2PyrogluPeps = [
        'q' + pep[1:] for pep in peps if pep[0] == 'Q' and prot.find(pep) != 0
    ]
    peps.extend(anyNtermQGlu2PyrogluPeps)

    # Variable modification: M oxidation ('M' -> 'm').
    variableModPeps = []
    for pep in peps:
        variableModPeps.extend(getVariableModPeps('M', 'm', pep))
    peps.extend(variableModPeps)

    return peps
def digest_proteins(fasta_file):
    """Digest every protein of a FASTA file in silico and tabulate the peptides.

    Each sequence is upper-cased, a leading "M" is removed, and the rest is
    cleaved with the regular expression ``'[DE]'`` (cleavage after D or E,
    i.e. a Glu-C style rule). Only peptides of 4 to 40 residues are kept.

    Args:
        fasta_file: path or handle accepted by ``SeqIO.parse``.

    Returns:
        pandas.DataFrame with one row per peptide and the columns
        ``protein`` (record id), ``peptide_sequence``, ``initial_aa_index``
        (1-based position of the peptide's first residue in the M-trimmed
        sequence) and ``terminal_aa_index`` (``initial_aa_index`` plus the
        peptide length). Positions refer to the first occurrence of the
        peptide in the protein. Each protein's rows keep their own 0..n-1
        index, as before. A file without usable proteins yields an empty
        frame with these columns.
    """
    columns = [
        'protein', 'peptide_sequence', 'initial_aa_index', 'terminal_aa_index'
    ]
    # One frame per protein, concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole table
    # on every call.
    frames = []
    for protein in tqdm(SeqIO.parse(fasta_file, "fasta")):
        protein_sequence = str(protein.seq).upper()

        # cleave initial "start" methionine if present
        # (startswith also copes with an empty record)
        if protein_sequence.startswith("M"):
            protein_sequence = protein_sequence[1:]
        if not protein_sequence:
            continue

        # Glu-C style digestion; for Arg-C use parser.expasy_rules['arg-c'].
        peptides = list(parser.cleave(protein_sequence, '[DE]'))

        # Residue positions, used later to link PTM positions to peptides.
        # +1 converts the zero-based string index to a 1-based position.
        initial_index = [1 + protein_sequence.index(pep) for pep in peptides]
        terminal_index = [
            start + len(pep) for start, pep in zip(initial_index, peptides)
        ]

        frames.append(
            pd.DataFrame({
                'protein': [protein.id] * len(peptides),
                'peptide_sequence': peptides,
                'initial_aa_index': initial_index,
                'terminal_aa_index': terminal_index
            }))

    if frames:
        protein_df = pd.concat(frames)
    else:
        protein_df = pd.DataFrame(columns=columns)

    # Drop missing peptides and those shorter than 4 or longer than 40 aa.
    sequences = protein_df['peptide_sequence']
    lengths = sequences.str.len()
    protein_df = protein_df[sequences.notna() & (lengths >= 4) &
                            (lengths <= 40)]

    sys.stdout.write("Finished in silico digestion and generating peptides.\n")
    return protein_df
def read_fasta_sequences(fasta_file):
    """Read sequence records from a FASTA file.

    Returns a list with one dict per FASTA entry, in file order:

    * ``'sequence'``: the sequence string;
    * ``'id'``: the first whitespace-separated token of the description;
    * ``'peptides'``: the tryptic peptides of the sequence (ExPASy trypsin
      rule, at most one missed cleavage), as returned by ``parser.cleave``.
    """
    return [
        {
            'sequence': sequence,
            'id': description.split()[0],
            'peptides': parser.cleave(
                sequence,
                parser.expasy_rules['trypsin'],
                missed_cleavages=1,  # max no. of missed cleavages
            ),
        }
        for description, sequence in fasta.read(fasta_file)
    ]
def prot_to_peprec(protein):
    """Digest one protein record into a PEPREC-style peptide table.

    The sequence is cleaved with the ExPASy trypsin rule, allowing
    ``params['missed_cleavages']`` missed cleavages (``params`` comes from
    ``get_params()``). A peptide is kept when it

    * contains none of B, J, O, U, X, Z;
    * has at least ``params['min_peplen']`` residues and fewer than
      ``int(params['max_pepmass'] / 186 + 2)`` residues; and
    * has a calculated mass not above ``params['max_pepmass']``.

    Args:
        protein: record exposing ``.seq`` and ``.id``.

    Returns:
        pandas.DataFrame with columns ``spec_id`` ("<protein id>_<NNN>",
        numbered from 001 in digestion order), ``peptide``, ``modifications``
        (always '-') and ``charge`` (always NaN). Empty, with the same
        columns, when no peptide qualifies.
    """
    params = get_params()
    columns = ['spec_id', 'peptide', 'modifications', 'charge']
    ambiguous = ('B', 'J', 'O', 'U', 'X', 'Z')
    max_pepmass = params['max_pepmass']
    # Cheap length pre-filter before the mass calculation; 186 is presumably
    # the mass of the heaviest residue (Trp) - TODO confirm.
    max_length = int(max_pepmass / 186 + 2)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the table for every row.
    rows = []
    for peptide in cleave(str(protein.seq), expasy_rules['trypsin'],
                          params['missed_cleavages']):
        if any(aa in peptide for aa in ambiguous):
            continue
        if not params['min_peplen'] <= len(peptide) < max_length:
            continue
        if mass.calculate_mass(sequence=peptide) > max_pepmass:
            continue
        rows.append({
            'spec_id': '{}_{:03d}'.format(protein.id, len(rows) + 1),
            'peptide': peptide,
            'modifications': '-',
            'charge': np.nan
        })
    return pd.DataFrame(rows, columns=columns)
geneError += 1 gError.append(identifier) match4 = re.search(r'\\PE=(.+?) \\', record.description) if match4: pe = match4.group(1) else: pe = '' print(f'WARNING: Unable to find PE value in {record.description}') match5 = re.search(r'\((.{0,5})\|(.{0,5})\|PEFF:\d+\|mature protein\)', record.description) x = len( re.findall(r'\((.{0,5})\|(.{0,5})\|PEFF:\d+\|mature protein\)', record.description)) if x > 1: mature = fixed_sequence peptides = parser.cleave(mature, 'trypsin') for peptide in peptides: if 9 <= len(peptide) <= 40: trypnum += 1 if match5: if match5.group(1) == '?' or match5.group(2) == '?': matureUnknown += 1 mUnknown.append(identifier) mature = fixed_sequence peptides = parser.cleave(mature, 'trypsin') for peptide in peptides: if 9 <= len(peptide) <= 40: trypnum += 1 else: start = int(match5.group(1)) end = int(match5.group(2))
""" Created on Fri Mar 22 11:28:43 2013 @author: ilya """ from pyteomics import fasta, mgf, parser import pylab fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta' mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf' peptides = set() with open(fasta_file) as fi: for description, sequence in fasta.read(fi): new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin']) peptides.update(new_peptides) print "UNIQUE PEPTIDES" print peptides with open(mgf_file) as fi: for spectrum in mgf.read(fi): pylab.figure() pylab.xlabel('m/z, Th') pylab.ylabel('Intensity, rel.units') pylab.bar(spectrum['m/z array'], spectrum['intensity array'], width=0.1, linewidth=2, edgecolor='black') pylab.show() inp = raw_input("Show more?") if inp != "yes": break;
def pypgatk_decoy_database(self):
    """
    Create a target + decoy FASTA database from ``self._input_fasta``.

    Pass 1 digests every target protein (configured enzyme rule,
    ``self._max_missed_cleavages``, ``self._min_peptide_length``) and pools
    the peptides in a set of target peptides.

    Pass 2 re-reads the input and, for each protein, writes the target entry
    to ``self._output_file`` followed by a decoy entry. The decoy is built
    per '*'-separated segment: the segment is passed to ``self.revswitch``
    and digested with no missed cleavages and no minimum length. Each decoy
    peptide is then handled as follows:

    - peptides shorter than ``self._min_peptide_length`` are kept;
    - peptides not found in the target set are kept;
    - peptides found in the target set are shuffled up to
      ``self._max_iterations`` times (unless ``self._no_suffle`` is set);
      the first shuffle that is not a target peptide is kept, otherwise the
      peptide is recorded as having no alternative and is dropped unless
      ``self._keep_target_hits`` is true.

    The peptides with no alternative are written to
    ``<output without '.fa'>_noAlternative.fa`` and summary counts are
    printed.

    :return: None
    """

    # Target peptide set and the decoy peptides for which no non-target
    # shuffle could be found
    upeps = set()
    noAlternative = set()

    # First pass over the input FASTA: collect all target peptides
    fasta = SeqIO.parse(self._input_fasta, 'fasta')

    # loop each seq in the file
    for record in fasta:
        seq = str(record.seq)
        # treat I and L as the same residue unless self._isobaric is set
        if not self._isobaric:
            seq = seq.replace('I', 'L')

        # digest sequence add peptides to the target set
        upeps.update(
            cleave(sequence=seq,
                   rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                   ['cleavage rule'],
                   missed_cleavages=self._max_missed_cleavages,
                   min_length=self._min_peptide_length))

    # Second pass: write each target entry followed by its decoy entry
    with open(self._output_file, 'w') as outfa:
        fasta = SeqIO.parse(self._input_fasta, 'fasta')
        # sequences are kept only for the amino-acid totals printed at the end
        targets = []
        decoys = []
        for i, record in enumerate(fasta):
            protseq = str(record.seq)
            targets.append(protseq)
            revprotseq = []

            # output target protein
            # NOTE(review): the header is id + ' ' + description; if
            # record.description already starts with the id (as Biopython's
            # FASTA parser usually provides), the id appears twice - confirm.
            seq = str(record.seq)
            id_protein = record.id
            description = record.description
            outfa.write('>' + id_protein + ' ' + description + '\n')
            outfa.write(seq + '\n')

            # each '*'-separated segment is reversed and digested on its own
            for seq in protseq.split('*'):
                if not seq:
                    continue
                if not self._isobaric:
                    seq = seq.replace('I', 'L')

                # reverse and switch protein sequence
                decoyseq = self.revswitch(
                    seq, self._no_switch, PYGPATK_ENZYMES.enzymes[
                        self._enzyme]['cleavage sites'])

                # NOTE(review): the peptides are re-joined below in iteration
                # order, which assumes `cleave` yields them in sequence
                # order - confirm which `cleave` is imported here.
                decoy_peps = cleave(sequence=decoyseq,
                                    rule=PYGPATK_ENZYMES.enzymes[
                                        self._enzyme]['cleavage rule'],
                                    missed_cleavages=0,
                                    min_length=0)

                # if any of the digested peptides are found in the targets
                # (upeps) then shuffle
                checked_decoy_peps = []
                for decoy_pep in decoy_peps:
                    # short peptides were never added to upeps; keep as is
                    if len(decoy_pep) < self._min_peptide_length:
                        checked_decoy_peps.append(decoy_pep)
                        continue

                    found_in_target = False
                    aPep = ''
                    if decoy_pep in upeps:
                        found_in_target = True
                    else:
                        checked_decoy_peps.append(decoy_pep)
                        continue

                    # peptides already known to have no alternative are not
                    # shuffled again (aPep stays '')
                    if found_in_target and not self._no_suffle and decoy_pep not in noAlternative:
                        aPep = decoy_pep
                        # shuffle until aPep is not in target set (at most
                        # self._max_iterations iterations); this reuses the
                        # name `i` of the enumerate index, which is not read
                        # anywhere else
                        i = 0
                        while aPep in upeps and i < self._max_iterations:
                            # increment iteration counter
                            i += 1
                            # shuffle peptide
                            aPep = self.shuffle(aPep)
                            # check if shuffling has an effect if not end iterations
                            if aPep == decoy_pep:
                                i = self._max_iterations

                        # no suitable alternative was found: remember the
                        # peptide and fall through to the keep/skip decision
                        if i == self._max_iterations:
                            noAlternative.add(decoy_pep)
                            aPep = ''

                    # if decoy is generated then add to the list of peptides
                    if aPep:
                        checked_decoy_peps.append(aPep)
                    else:
                        if self._keep_target_hits:
                            checked_decoy_peps.append(decoy_pep)

                # finally join the peptides to generate protein decoy
                if checked_decoy_peps:
                    revprotseq.append(''.join(checked_decoy_peps))

            # one decoy entry per protein; segments are re-joined with '*'
            outfa.write('>{}\n{}\n'.format(
                self._decoy_prefix + str(record.id) + ' ' + record.description,
                '*'.join(revprotseq)))
            decoys.append('*'.join(revprotseq))

    # NOTE(review): str.replace removes every '.fa' occurrence, so an output
    # name ending in '.fasta' loses '.fa' and keeps 'sta' - confirm intended.
    with open(
            self._output_file.replace('.fa', '') + '_noAlternative.fa',
            'w') as noAlternative_outfa:
        noAlternative_outfa.write('\n'.join(noAlternative) + '\n')

    print(
        'Number of skipped tryptic peptides in decoy db (no alternatives): {}'
        .format(len(noAlternative)))
    print(
        'Total number of amino acids in target and decoy databases: ',
        len(''.join(targets)), len(''.join(decoys)))
def generate_decoypyrat_database(self):
    """
    Create a decoy database from a proteomics database.

    This method is presented in the manuscript:
    J Proteomics Bioinform. 2016 Jun 27; 9(6): 176–180. PMCID: PMC4941923
    DecoyPyrat: Fast Non-redundant Hybrid Decoy Sequence Generation for
    Large Scale Proteomics

    Steps, as implemented here:

    1. Read ``self._input_fasta``; digest each protein into the target
       peptide set, pass it to ``self.revswitch`` and write the resulting
       decoy protein to ``self._temp_file``.
    2. Find the decoy peptides that also occur in the target set (from the
       in-memory decoy set, or by re-reading the temp file when
       ``self._memory_save`` is set).
    3. If there are such peptides and ``self._no_suffle`` is False, shuffle
       each one up to ``self._max_iterations`` times to find a non-target
       replacement, then write ``self._output_file`` as the target entries
       followed by the decoy entries with the replacements applied, and
       delete the temp file. Otherwise the temp file is simply renamed to
       the output file.

    :return: None
    """

    # Create empty sets to add all target and decoy peptides
    upeps = set()
    dpeps = set()

    # Counter for number of decoy sequences
    dcount = 0

    # Read the input FASTA file
    with open(self._input_fasta) as handle:
        # open temporary decoy FASTA file
        with open(self._temp_file, 'w') as outfa:

            # loop each seq in the file; value is (description, sequence)
            for value in SimpleFastaParser(handle):
                seq = value[1]
                description = value[0]
                dcount += 1

                # replace I by L unless self._isobaric is set
                if not self._isobaric:
                    seq = seq.replace('I', 'L')

                # digest sequence add peptides to set
                upeps.update(
                    cleave(sequence=seq,
                           rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                           ['cleavage rule'],
                           missed_cleavages=0,
                           min_length=self._min_peptide_length))

                # reverse and switch protein sequence
                decoyseq = self.revswitch(
                    seq, self._no_switch, PYGPATK_ENZYMES.enzymes[
                        self._enzyme]['cleavage sites'])

                # do not store decoy peptide set in reduced memory mode
                if not self._memory_save:
                    # update decoy peptide set
                    dpeps.update(
                        cleave(sequence=decoyseq,
                               rule=PYGPATK_ENZYMES.enzymes[self._enzyme]
                               ['cleavage rule'],
                               missed_cleavages=0,
                               min_length=self._min_peptide_length))

                # write decoy protein accession and sequence to file
                outfa.write('>' + self._decoy_prefix + description + '\n')
                outfa.write(decoyseq + '\n')

    # Summarise the numbers of target and decoy peptides and their intersection
    nonDecoys = set()
    print("proteins:" + str(dcount))
    print("target peptides:" + str(len(upeps)))

    # Reloop decoy file in reduced memory mode to store only intersecting decoys
    if self._memory_save:
        # open temp decoys
        with open(self._temp_file, "rt") as fin:
            for line in fin:
                # if line is not accession
                if line[0] != '>':
                    # digest protein
                    for p in cleave(sequence=line.rstrip(),
                                    rule=PYGPATK_ENZYMES.enzymes[
                                        self._enzyme]['cleavage rule'],
                                    missed_cleavages=0,
                                    min_length=self._min_peptide_length):
                        # check if in target peptides if true then add to nonDecoys
                        if p in upeps:
                            nonDecoys.add(p)
        # redundant: the with-block above has already closed the file
        fin.close()
        print("decoy peptides: !Memory Saving Made!")
    else:
        # can only report total number in normal memory mode
        print("decoy peptides:" + str(len(dpeps)))
        # find intersecting peptides
        nonDecoys = upeps.intersection(dpeps)

    print("#intersection:" + str(len(nonDecoys)))

    # if there are decoy peptides that are in the target peptide set
    if len(nonDecoys) > 0 and self._no_suffle == False:

        # create empty dictionary with bad decoys as keys
        dAlternative = dict.fromkeys(nonDecoys, '')
        noAlternative = list()

        # loop bad decoys / dictionary keys
        for dPep in dAlternative:
            i = 0
            aPep = dPep

            # shuffle until aPep is not in target set (at most
            # self._max_iterations iterations); every attempt shuffles the
            # original peptide dPep, not the previous attempt
            while aPep in upeps and i < self._max_iterations:
                # increment iteration counter
                i += 1
                # shuffle peptide
                aPep = self.shuffle(dPep)
                # check if shuffling has an effect if not end iterations
                if aPep == dPep:
                    i = self._max_iterations

            # update dictionary with alternative shuffled peptide
            dAlternative[dPep] = aPep

            # warn if peptide has no suitable alternative, add to removal list
            if i == self._max_iterations:
                noAlternative.append(dPep)

        print(str(len(noAlternative)) + ' have no alternative peptide')

        # remove peptides with no alternative
        for p in noAlternative:
            del dAlternative[p]

        # Free up memory by clearing large sets of peptides
        upeps.clear()
        dpeps.clear()

        # open second decoy file
        with open(self._output_file, "wt") as fout:

            # Attach the target sequences to the database
            with open(self._input_fasta) as handle:
                for value in SimpleFastaParser(handle):
                    description = value[0]
                    seq = value[1]
                    fout.write('>' + description + '\n')
                    fout.write(seq + '\n')

            # open original decoy file
            with open(self._temp_file, "rt") as fin:
                # loop each line of original decoy fasta
                for line in fin:
                    # if line is not accession replace peptides in dictionary with alternatives
                    if line[0] != '>':
                        # digest decoy sequence
                        for p in cleave(
                                sequence=line.rstrip(),
                                rule=PYGPATK_ENZYMES.enzymes[
                                    self._enzyme]['cleavage rule'],
                                missed_cleavages=0,
                                min_length=self._min_peptide_length):
                            # store decoy peptide for final count
                            dpeps.add(p)

                            # if decoy peptide is in dictionary replace with alternative
                            if p in dAlternative:
                                line = line.replace(p, dAlternative[p])

                    fout.write(line)
            # redundant: both files are closed by their with-blocks
            fin.close()
        fout.close()

        # delete temporary file
        os.remove(self._temp_file)
    else:
        # NOTE(review): in this branch the output file is just the renamed
        # temp file, which holds decoy entries only; the target sequences
        # are attached only in the branch above - confirm that is intended.
        os.rename(self._temp_file, self._output_file)

    # In memory-save mode dpeps is only filled while rewriting the decoys
    # above, so this prints 0 when the rename branch was taken.
    print("final decoy peptides:" + str(len(dpeps)))
def digest(prot):
    """Return the fully cleaved tryptic peptides of *prot* as a list.

    The protein is cleaved with the ExPASy trypsin rule without missed
    cleavages; only peptides with a length between ``MIN_LENGTH_OF_PEP`` and
    ``MAX_LENGTH_OF_PEP`` (inclusive) are returned.
    """
    tryptic = parser.cleave(prot,
                            parser.expasy_rules["trypsin"],
                            missed_cleavages=0,
                            min_length=MIN_LENGTH_OF_PEP)
    # The lower bound is applied by cleave itself; enforce the upper one here.
    return [pep for pep in tryptic if len(pep) <= MAX_LENGTH_OF_PEP]
# -v may be repeated; argparse counts the occurrences in args.verbosity.
arg_parser.add_argument('-v', '--verbosity', action='count', help='increase output verbosity')
args = arg_parser.parse_args()

# The report is written next to the input file, as "<fasta_file>.peptides".
# TODO: Do it proper way - using os.path
out_file = args.fasta_file + '.peptides'

# One set of peptides per FASTA entry, in file order.
peptides = []

with fasta.read(args.fasta_file) as reader, open(out_file,'w') as writer:

    # Build a set of peptides for each fasta sequence
    if args.verbosity >= 1:
        print 'Building digests...'
    for description, sequence in reader:
        # Digest with the chosen ExPASy rule and number of missed cleavages,
        # then keep peptides strictly longer than args.min.
        peps = parser.cleave(sequence, parser.expasy_rules[args.enz], args.missed)
        peps = [x for x in peps if len(x) > args.min]
        # Per-protein section of the report: two header lines, then one
        # peptide per line.
        writer.write('Peptides for {seq} ({enz} cleavage)\n'.format(
            seq=description, enz=args.enz))
        writer.write('...\t{n} missed cleavages\n'.format(n=args.missed))
        writer.write('\n'.join(peps)+'\n')
        peptides.append(set(peps))
        if args.verbosity >= 2:
            print '...\t{n} peptides for {prot}'.format(n=len(peps),prot=description)

    # Identify unique peptides for each fasta sequence
    if args.verbosity >= 1:
        print 'Finding unique peptides...'
    for peps in peptides:
        # Peptides of this entry that occur in no other entry's set
        # (other entries are selected by identity, not by equality).
        rest = [x for x in peptides if x is not peps]
        unique = peps - set().union(*rest)
def digest_protein(protein=None):
    """Cleave *protein* after every K or R and return the peptides.

    The cleavage rule is the plain regular expression ``'[KR]'``; up to 3
    missed cleavages are allowed and peptides shorter than 6 residues are
    discarded. The result is whatever ``parser.cleave`` returns.
    """
    return parser.cleave(protein, '[KR]', min_length=6, missed_cleavages=3)
def trypsin(aa):
    """Return the tryptic peptides of the amino-acid sequence *aa*.

    Uses the ExPASy trypsin rule, allows one missed cleavage and keeps only
    peptides of at least 7 residues.
    """
    rule = parser.expasy_rules['trypsin']
    return parser.cleave(aa, rule, missed_cleavages=1, min_length=7)