def test_tostring(self):
    """Check that ``parser.tostring`` inverts ``parser.parse``.

    Every entry of ``self.simple_sequences`` is parsed twice (once with only
    ``labels`` given, once with two extra positional ``True`` flags) and the
    string rebuilt by ``parser.tostring`` must equal the input.
    """
    for sequence in self.simple_sequences:
        # Default parse, default tostring.
        flat = parser.parse(sequence, labels=uppercase)
        self.assertEqual(sequence, parser.tostring(flat))

        # NOTE(review): the positional flags are presumably
        # show_unmodified_termini and split -- confirm against the
        # pyteomics signature. tostring gets False as its second argument.
        expanded = parser.parse(sequence, True, True, labels=uppercase)
        self.assertEqual(sequence, parser.tostring(expanded, False))
def test_parse(self):
    """Check ``parser.parse`` on plain, terminated and modified sequences."""
    residues = ['P', 'E', 'P', 'T', 'I', 'D', 'E']

    # With split=True every residue comes back as a one-element tuple.
    self.assertEqual([(aa,) for aa in residues],
                     parser.parse('PEPTIDE', split=True))

    # An explicit standard N-terminus is not reported by default.
    self.assertEqual(residues, parser.parse('H-PEPTIDE'))

    # Whatever termini were spelled out, asking for unmodified termini
    # yields both of them.
    with_termini = ['H-'] + residues + ['-OH']
    for seq in ('PEPTIDE', 'H-PEPTIDE', 'PEPTIDE-OH', 'H-PEPTIDE-OH'):
        self.assertEqual(with_termini,
                         parser.parse(seq, show_unmodified_termini=True))

    # Custom modified-residue labels are recognised as single units.
    modified_labels = parser.std_labels + ['pS', 'oxM']
    self.assertEqual(['T', 'E', 'pS', 'T', 'oxM'],
                     parser.parse('TEpSToxM', labels=modified_labels))

    # A bare modification label ('z') is split from its residue, and the
    # termini are attached to the first and last groups.
    z_labels = parser.std_labels + ['z']
    expected_groups = [
        ('H-', 'z', 'P'),
        ('E',),
        ('P',),
        ('z', 'T'),
        ('I',),
        ('D',),
        ('z', 'E', '-OH'),
    ]
    self.assertEqual(expected_groups,
                     parser.parse('zPEPzTIDzE', True, True, labels=z_labels))
def calc_precursor_theoretical(seq, z):
    """Compute the theoretical mass and m/z of a precursor peptide.

    ``seq`` is converted with ``seqModX`` and parsed by ``parser.parse``
    (using ``modLabels`` and keeping unmodified termini). The mass is then
    computed with ``mass.calculate_mass`` twice: once without a charge and
    once with ``charge=z``.

    Returns
    -------
    tuple
        ``(parsed_sequence, mass, mz)`` on success, or
        ``(None, None, None)`` if any step raises an ordinary exception.
    """
    try:
        parseq = parser.parse(seqModX(seq), labels=modLabels,
                              show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq,
                                       aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq,
                                     aa_comp=composition, charge=z)
    except Exception:
        # Best-effort: callers get the sentinel triple for unparsable input.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so the process can still be stopped.
        return (None, None, None)
    return (parseq, theomass, theomz)
def test_isotopologues(self):
    """Check ``mass.isotopologues`` for every supported way of giving input.

    The same substance is described positionally, as a sequence, as a parsed
    sequence, as a split sequence, as a formula and as a ``Composition``.
    Each isotopologue produced must be one of the two expected isotopic
    states, and its abundance must match the expected value.
    """
    peptide = 'XYF'
    states = [
        {'F[6]': 1, 'A': 1, 'B': 1, 'D': 1, 'E': 1},
        {'F[7]': 1, 'A': 1, 'B': 1, 'D': 1, 'E': 1},
    ]
    abundances = [0.7, 0.3]
    kw_common = dict(elements_with_isotopes='F',
                     aa_comp=self.aa_comp,
                     mass_data=self.mass_data)
    kwlist = [
        {},
        {'sequence': 'XYF'},
        {'parsed_sequence': parser.parse(
            'XYF', show_unmodified_termini=True)},
        {'split_sequence': parser.parse(
            'XYF', show_unmodified_termini=True, split=True)},
        {'formula': 'ABDEF'},
        {'composition': mass.Composition(
            sequence='XYF', aa_comp=self.aa_comp)},
    ]
    # Only the first variant passes the peptide positionally.
    arglist = [(peptide, ), (), (), (), (), ()]
    for args, kw in zip(arglist, kwlist):
        kwargs = kw_common.copy()
        kwargs.update(kw)
        isotopologues = mass.isotopologues(*args, **kwargs)
        for state in isotopologues:
            # list.index raises ValueError for a missing item and never
            # returns -1, so membership is asserted explicitly to get a
            # proper test failure instead of an error.
            self.assertIn(state, states)
            i = states.index(state)
            self.assertAlmostEqual(
                abundances[i],
                mass.isotopic_composition_abundance(
                    state,
                    aa_comp=self.aa_comp,
                    mass_data=self.mass_data))
def test_isoforms_maxmods(self):
    """Check that ``parser.isoforms`` honours ``max_mods``.

    For 50 random peptides (length 1-10) and random limits (1-10), every
    isoform in 'split' format must have as many positions as the parsed
    peptide and differ from it in at most ``max_mods`` positions.
    """
    for _trial in range(50):
        # Draw order (length, limit, residues) matters for seeded runs.
        length = random.randint(1, 10)
        limit = random.randint(1, 10)
        peptide = ''.join(
            [random.choice(self.labels) for _ in range(length)])
        isoforms = parser.isoforms(peptide,
                                   variable_mods=self.potential,
                                   labels=self.labels,
                                   max_mods=limit,
                                   format='split')
        reference = parser.parse(peptide, labels=self.extlabels, split=True)
        for isoform in isoforms:
            self.assertEqual(len(reference), len(isoform))
            n_changed = sum(
                1 for plain, modified in zip(reference, isoform)
                if plain != modified)
            self.assertLessEqual(n_changed, limit)
def get_peptide_data(peptide):
    """Collect the sequence, parsed sequence and mass of a peptide.

    Returns a dict with the keys ``'sequence'``, ``'parsed_sequence'`` and
    ``'mass'``.
    """
    # The termini are kept in the parsed sequence for the mass calculation.
    parsed = parser.parse(peptide, show_unmodified_termini=True)
    peptide_mass = mass.calculate_mass(parsed)
    return {
        'sequence': peptide,
        'parsed_sequence': parsed,
        'mass': peptide_mass,
    }
def apply_var_mods(seq, mods):
    """Return ``seq`` with mass-shift tags inserted before modified residues.

    ``mods`` is looked up by the zero-based position in the parsed sequence.
    A residue whose position is present gets the prefix ``{<shift>}``, where
    the shift is formatted with an explicit sign and no decimals.
    """
    residues = parser.parse(seq)
    tagged = [
        '{{{:+.0f}}}'.format(mods[position]) + residue
        if position in mods else residue
        for position, residue in enumerate(residues)
    ]
    seqout = ''.join(tagged)
    # NOTE(review): `internal` looks like a logging helper defined elsewhere.
    internal('%s + %s = %s', seq, mods, seqout)
    return seqout
def test_isoforms_len(self):
    """Check the length and the number of isoforms of random peptides.

    For 50 random peptides, every isoform must have the same length as the
    parsed peptide, and the isoform count must be
    ``3 ** (#A) * 2 ** (#X + #C + N)``, where ``N`` counts a leading 'N'
    and a trailing 'C'.
    """
    for _trial in range(50):
        length = random.randint(1, 10)
        peptide = ''.join(random.choice(self.labels) for _ in range(length))
        isoforms = list(parser.isoforms(peptide,
                                        variable_mods=self.potential,
                                        fixed_mods=self.constant,
                                        labels=self.labels))
        parsed = parser.parse(peptide, labels=self.extlabels)
        expected_len = len(parsed)

        # A bool adds as 0/1: one extra site per matching end residue.
        end_sites = (parsed[0] == 'N') + (parsed[-1] == 'C')

        for isoform in isoforms:
            self.assertEqual(
                expected_len,
                parser.length(isoform, labels=self.extlabels))

        three_state = parsed.count('A')
        two_state = parsed.count('X') + parsed.count('C') + end_sites
        self.assertEqual(len(isoforms),
                         (3 ** three_state) * (2 ** two_state))
def calc_precursor_theoretical(seq, z):
    """Compute the theoretical mass and m/z of a precursor peptide.

    ``seq`` is converted with ``seqModX`` and parsed by ``parser.parse``
    (using ``modLabels`` and keeping unmodified termini). The mass is then
    computed with ``mass.calculate_mass`` twice: once without a charge and
    once with ``charge=z``.

    Returns
    -------
    tuple
        ``(parsed_sequence, mass, mz)`` on success, or
        ``(None, None, None)`` if any step raises an ordinary exception.
    """
    try:
        parseq = parser.parse(seqModX(seq), labels=modLabels,
                              show_unmodified_termini=True)
        theomass = mass.calculate_mass(parsed_sequence=parseq,
                                       aa_comp=composition)
        theomz = mass.calculate_mass(parsed_sequence=parseq,
                                     aa_comp=composition, charge=z)
    except Exception:
        # Best-effort: callers get the sentinel triple for unparsable input.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so the process can still be stopped.
        return (None, None, None)
    return (parseq, theomass, theomz)
def _get_theoretical_peptide_fragments(peptide: str, types: str = 'by',
                                       max_charge: int = 1):
    """
    Get theoretical fragments for the given peptide.

    Parameters
    ----------
    peptide : str
        The peptide sequence for which the fragments will be generated.
    types : str, optional
        The fragment type. Can be any combination of 'a', 'b', 'c', 'x', 'y',
        and 'z' (the default is 'by', which means that b-ions and y-ions will
        be generated).
    max_charge : int, optional
        All fragments up to and including the given charge will be generated
        (the default is 1 to only generate singly-charged fragments).

    Returns
    -------
    A list of all fragments as (`FragmentAnnotation`, m/z) tuples sorted in
    ascending m/z order.
    """
    amino_acids = parser.parse(peptide)
    # Count parsed units, not characters of the raw string: the two differ
    # as soon as the string carries modification prefixes or explicit
    # terminal groups, which would shift every C-terminal ion index.
    n_units = len(amino_acids)
    ions = []
    for i in range(1, n_units):
        # The joined fragments depend only on the cleavage position.
        prefix = ''.join(amino_acids[:i])
        suffix = ''.join(amino_acids[i:])
        for ion_type in types:
            if ion_type in 'abc':
                # N-terminal series: numbered from the start.
                ion_index, fragment = i, prefix
            else:
                # Any other series is treated as C-terminal: numbered
                # from the end.
                ion_index, fragment = n_units - i, suffix
            for charge in range(1, max_charge + 1):
                ions.append((
                    FragmentAnnotation(ion_type, ion_index, charge),
                    mass.calculate_mass(sequence=fragment,
                                        ion_type=ion_type,
                                        charge=charge)))
    return sorted(ions, key=operator.itemgetter(1))
def prepare_libraries(sequence, **kwargs):
    """Build the label list, mass table and split parse for ``sequence``.

    The optional keyword arguments ``static``, ``variable`` and ``Ytype`` are
    each read as a collection of modification dicts with the keys
    ``"label"``, ``"mass"``, ``"auto_allocation"`` and ``"regex"``. Every
    label is added to a copy of ``parser.std_labels`` and its mass to a copy
    of ``mass.std_aa_mass``.

    Unless ``"auto_allocation"`` is ``"FALSE"`` or empty, the start offsets
    of all ``"regex"`` matches in ``sequence`` are appended to the dict's
    ``"positions"`` list (created if absent). This mutates the dicts passed
    in by the caller.

    Returns ``(labels, mod_mass, parsed)``, where ``parsed`` is
    ``parser.parse(sequence, labels=labels, split=True)``.
    """
    labels = parser.std_labels[:]
    mod_mass = dict(mass.std_aa_mass)
    no_auto_allocation = {"FALSE", ""}

    for group in ("static", "variable", "Ytype"):
        if group not in kwargs:
            continue
        for modification in kwargs[group]:
            label = modification["label"]
            labels.append(label)
            mod_mass[label] = modification["mass"]

            if modification["auto_allocation"] in no_auto_allocation:
                continue

            pattern = re.compile(modification["regex"])
            positions = modification.setdefault("positions", [])
            positions.extend(
                hit.start() for hit in pattern.finditer(sequence))

    return labels, mod_mass, parser.parse(sequence, labels=labels, split=True)
# -*- coding: utf-8 -*- """ Created on Wed Feb 27 20:56:42 2019 @author: bjwil """ import pyteomics from pyteomics import parser parser.is_modX('pTx') parser.is_modX('K') parser.parse('AcPEPTIDE', split=True)
'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/' 'knowledgebase/proteomes/YEAST.fasta.gz', 'yeast.fasta.gz') print 'Done!' print 'Cleaving the proteins with trypsin...' unique_peptides = set() for description, sequence in fasta.read(gzip.open('yeast.fasta.gz')): new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin']) unique_peptides.update(new_peptides) print 'Done, {0} sequences obtained!'.format(len(unique_peptides)) peptides = [{'sequence': i} for i in unique_peptides] print 'Parsing peptide sequences...' for peptide in peptides: peptide['parsed_sequence'] = parser.parse(peptide['sequence'], show_unmodified_termini=True) peptide['length'] = parser.length(peptide['parsed_sequence']) print 'Done!' peptides = [peptide for peptide in peptides if peptide['length'] <= 100] print 'Calculating the mass, charge and m/z...' for peptide in peptides: peptide['charge'] = int( round(electrochem.charge(peptide['parsed_sequence'], pH=2.0))) peptide['mass'] = mass.calculate_mass(peptide['parsed_sequence']) peptide['m/z'] = mass.calculate_mass(peptide['parsed_sequence'], charge=peptide['charge']) print 'Done!' print 'Calculating the retention time...'
elif(df['Old'][parser_index] == "*"): parser_index += 1 elif(df['Old'][parser_index] == ","): parser_index += 1 elif(c_term_parsing): if(df['Old'][parser_index]=='C'): if(df['Old'][parser_index+1]=='O'): if(df['Old'][parser_index+2]=='O'): if(df['Old'][parser_index+3]=='H'): parser_index += 4 df['New'][writer_index] = 'O' df['New'][writer_index+1] = 'H' writer_index += 1 c_term_parsing = False else: df['New'][writer_index] = df['Old'][parser_index].lower() parser_index += 1 writer_index += 1 new_seq = "" for i in range(pep_length): new_seq += df['New'][i] df_reduced['New_Sequence'][indexing] = new_seq df_reduced.to_csv('/mnt/compomics/Nicolas/Python/R2TF/data/project_transformed'+str(test['projectid'][projIndex])+'.csv', index=False,header=False) from pyteomics import parser new_seq_2 = "H-QpyrQSEEDLLLQDFSR-OH" parser.parse(new_seq_2, allow_unknown_modifications=True)
print "Warning! Command-line argument: %s not recognized. Exiting..." % opt sys.exit() inputfile01 = open(input_file, "r") # outputfile1 = open(output_file,'w') from pyteomics import parser from pyteomics import mass # gene_list = ['SAA1'] # gene_list = open(gene_list,'r') counter = 0 errcounter = 0 pepinput = "MALTSEYWIILR" ps0 = parser.parse(pepinput, show_unmodified_termini=True) referencemass = mass.calculate_mass(parsed_sequence=ps0) mass_tolerance = 7 # unit: ppm targetmass = 1422.730378 total_pep_list = [] for num, x in enumerate(SeqIO.parse(inputfile01, "fasta")): if num % 10000 == 0: print num # if num > 5000: # break pro = str(x.seq) peplist = digest(pro, enzyme, missed_cleavage, min_pep_length, max_pep_length) if len(peplist) > 0: for p in peplist: total_pep_list.append(p) sort_list = list(set(total_pep_list))
def test_parse_simple(self):
    """Check that joining the parsed labels gives back each simple sequence."""
    for sequence in self.simple_sequences:
        parsed = parser.parse(sequence, labels=uppercase)
        self.assertEqual(sequence, ''.join(parsed))
def test_calculate_mass(self):
    """Check ``mass.calculate_mass`` for every supported kind of input.

    Covers a formula, a sequence, a parsed sequence, the average mass,
    the m/z of charged ions, and agreement between string and parsed input
    for ``self.random_peptides``.
    """
    data = self.mass_data
    # Expected mass: the [0][0] entry of every element of 'ABCDE'.
    # NOTE(review): presumably the monoisotopic mass -- confirm.
    expected_mass = sum(data[atom][0][0] for atom in 'ABCDE')

    # Calculate mass by a formula.
    self.assertEqual(
        mass.calculate_mass(formula='ABCDE', mass_data=data),
        expected_mass)

    # Calculate mass by a sequence.
    self.assertEqual(
        mass.calculate_mass(sequence='XYZ',
                            aa_comp=self.aa_comp,
                            mass_data=data),
        expected_mass)

    # Calculate mass by a parsed sequence.
    self.assertEqual(
        mass.calculate_mass(parsed_sequence=['H-', 'X', 'Y', 'Z', '-OH'],
                            aa_comp=self.aa_comp,
                            mass_data=data),
        expected_mass)

    # Calculate average mass by a formula: sum over all isotope entries
    # except key 0 of (first element * second element).
    expected_average = sum(
        data[atom][isotope][0] * data[atom][isotope][1]
        for atom in 'ABCDE'
        for isotope in data[atom]
        if isotope != 0)
    self.assertEqual(
        mass.calculate_mass(formula='ABCDE', average=True, mass_data=data),
        expected_average)

    # Calculate m/z of an ion.
    for charge in (1, 2, 3):
        ion_mz = mass.calculate_mass(formula='ABCDE',
                                     ion_type='M',
                                     charge=charge,
                                     mass_data=data)

        # The charge carriers may also be written into the formula.
        self.assertEqual(
            ion_mz,
            mass.calculate_mass(formula='ABCDE' + 'H+%d' % (charge, ),
                                mass_data=data))

        # m/z equals (neutral mass + charge * carrier mass) / charge.
        neutral = mass.calculate_mass(formula='ABCDE', mass_data=data)
        self.assertEqual(
            ion_mz,
            (neutral + data['H+'][0][0] * charge) / charge)

        # A charge in the formula together with an explicit charge argument
        # must be rejected.
        self.assertRaises(
            auxiliary.PyteomicsError,
            mass.calculate_mass,
            formula='ABCDEH+%d' % charge,
            ion_type='M',
            charge=charge,
            mass_data=data)

    # Sanity check: string input and parsed input must give the same mass.
    for pep in self.random_peptides:
        common = dict(aa_comp=self.aa_comp,
                      mass_data=data,
                      ion_comp=self.ion_comp)
        from_string = mass.calculate_mass(sequence=pep, **common)
        parsed = parser.parse(pep,
                              labels=['X', 'Y', 'Z'],
                              show_unmodified_termini=True)
        from_parsed = mass.calculate_mass(parsed_sequence=parsed, **common)
        self.assertEqual(from_string, from_parsed)