def use(self, sequence):
    """
    Reset the instance and load a new sequence.

    Re-using one instance for many sequences avoids the cost of creating a
    new object for every sequence.

    Args:
        sequence (str): molecule in one of the supported input formats. A
            sequence containing ``#`` is handed to the unimod style parser,
            everything else to the old style parser.

    Note:
        The current chemical composition is cleared before parsing!
    """
    # Always start from an empty composition.
    self.clear()

    is_unimod_style = "#" in sequence
    if not is_unimod_style:
        self._parse_sequence_old_style(sequence)
        return

    # Unimod style format: create the mapper on first use if none is set.
    if self._unimod_parser is None:
        self._unimod_parser = pyqms.UnimodMapper()
    self._parse_sequence_unimod_style(sequence)
def parse_evidence(fixed_labels=None,
                   evidence_files=None,
                   molecules=None,
                   evidence_score_field=None,
                   return_raw_csv_data=False):
    '''
    Reads in the evidence file(s) and returns the final formatted fixed
    labels, the evidence lookup, which is passed to the isotopologue library,
    and the final formatted molecules (fixed labels are stripped from the
    molecules).

    Note:

        Output .csv files from `Ursgal`_ (`Documentation`_) can directly be
        used. Also `mzTab`_ files can be used as input.

    .. _Ursgal:
        https://github.com/ursgal/ursgal

    .. _Documentation:
        http://ursgal.readthedocs.io/en/latest/

    .. _mzTab:
        http://www.psidev.info/mztab

    Args:
        fixed_labels (dict): dict with fixed labels, example format is shown
            below.
        evidence_files (list): list of evidence file paths. The parser is
            chosen from the file name ending (``csv`` or ``mztab``, case
            insensitive); files with any other ending are reported on stdout
            and skipped. ``None`` is treated as an empty list.
        molecules (list): list of additional molecules
        evidence_score_field (str): specify fieldname which holds the search
            engine score (Default is "PEP")
        return_raw_csv_data (bool): if True, the buffered rows of every parsed
            evidence file are returned as an additional, fourth tuple element
            (dict: evidence file path -> list of row dicts).

    Example fixed label format::

        {
            'C' : [
                {
                    'element_composition': {
                        'O': 1,
                        'H': 3,
                        '14N': 1,
                        'C': 2
                    },
                    'evidence_mod_name': 'Carbamidomethyl'
                },
            ]
        }

    Returns:
        tuple: final formatted fixed label dict (None if no fixed labels were
        given), evidence lookup (None if no evidence file was given), list of
        molecules and, only if `return_raw_csv_data` is True, the raw row
        buffer per evidence file.

    Raises:
        ValueError: if a modification of a molecule does not end with
            ``:<position>``.
    '''
    if molecules is None:
        molecules = []
    if evidence_files is None:
        # The default must be iterable, otherwise the file loop fails.
        evidence_files = []
    if evidence_score_field is None:
        evidence_score_field = 'PEP'  # default

    unimod_parser = pyqms.UnimodMapper()

    fixed_mod_lookup = {}
    amino_acid_2_fixed_mod_name = ddict(list)
    formatted_fixed_labels = None
    evidence_lookup = None
    molecule_set = set()
    all_fixed_mod_names = set()

    if fixed_labels:
        formatted_fixed_labels = {}
        for aa, fixed_mod_info_dict_list in fixed_labels.items():
            for fixed_mod_info_dict in fixed_mod_info_dict_list:
                element_composition = fixed_mod_info_dict[
                    'element_composition']
                if isinstance(element_composition, dict):
                    tmp_cc_factory = \
                        pyqms.chemical_composition.ChemicalComposition()
                    tmp_cc_factory.add_chemical_formula(element_composition)
                else:
                    # Caller supplied a ready-made composition object. It is
                    # only read and deep-copied below, never cleared, so the
                    # caller's object stays intact.
                    tmp_cc_factory = element_composition
                mod_name = fixed_mod_info_dict['evidence_mod_name']
                formatted_fixed_labels.setdefault(aa, []).append(
                    tmp_cc_factory.hill_notation_unimod())
                # save it under name and amino acid!
                fixed_mod_lookup[mod_name] = dc(tmp_cc_factory)
                amino_acid_2_fixed_mod_name[aa].append(mod_name)
                all_fixed_mod_names.add(mod_name)

    cc_factory = pyqms.chemical_composition.ChemicalComposition()

    # Columns that may carry an additional (trivial) name for a molecule.
    trivial_name_keys = (
        'proteinacc_start_stop_pre_post_;',  # old ursgal style
        'trivial_name',  # self defined name
        'Protein ID',  # new ursgal style
        'accession',  # mzTab style
    )

    # this is the lookup for the lib with the evidences
    tmp_evidences = {}
    csv_raw_data_to_return = {}
    for evidence_file in evidence_files:
        evidence_lookup = {}
        upper_case_name = evidence_file.upper()
        if upper_case_name.endswith('CSV'):
            input_is_csv = True
            modification_fieldname = 'Modifications'
            rt_fieldname = 'Retention Time (s)'
            seq_fieldname = 'Sequence'
        elif upper_case_name.endswith('MZTAB'):
            input_is_csv = False
            modification_fieldname = 'modifications'
            rt_fieldname = 'retention_time'
            seq_fieldname = 'sequence'
        else:
            print(
                'The format {0} is not recognized by the pyQms adaptor function'
                .format(os.path.splitext(evidence_file)[1]))
            # Nothing can be parsed from this file; without skipping, the
            # code below would use an undefined or stale reader.
            continue

        # Buffer the complete file so it can be closed before processing.
        with codecs.open(evidence_file, mode='r',
                         encoding='utf-8') as opened_evidence_file:
            if input_is_csv:
                dict_reader = csv.DictReader(opened_evidence_file)
            else:
                # Only the PSM header (PSH) and PSM rows are of interest.
                dict_reader = csv.DictReader(
                    [
                        row for row in opened_evidence_file
                        if row[:3] in ('PSM', 'PSH')
                    ],
                    delimiter='\t')
            input_buffer = list(dict_reader)
        csv_raw_data_to_return[evidence_file] = input_buffer

        for line_dict in input_buffer:
            modifications = line_dict.get(modification_fieldname, '')
            if modifications == '':
                molecule = line_dict[seq_fieldname]
            else:
                if input_is_csv:
                    formatted_mods = modifications
                else:
                    # mzTab style, e.g. 2-UNIMOD:4,3-UNIMOD:4, is converted
                    # into '<unimod name>:<pos>' joined by ';'
                    formatted_mods = []
                    for pos_and_unimod_id in modifications.split(','):
                        pos, unimod_id = pos_and_unimod_id.split('-')
                        unimod_name = unimod_parser.id2name(
                            unimod_id.split(':')[1])
                        formatted_mods.append('{0}:{1}'.format(
                            unimod_name, pos))
                    formatted_mods = ';'.join(formatted_mods)
                molecule = '{0}#{1}'.format(line_dict[seq_fieldname],
                                            formatted_mods)

            dict_2_append = {}
            rt = line_dict.get(rt_fieldname, '')
            # seconds is the standard also for mzTab
            if rt != '':
                dict_2_append['RT'] = float(rt) / 60.0  # always in min
            score = line_dict.get(evidence_score_field, '')
            if score != '':
                dict_2_append['score'] = float(score)
                dict_2_append['score_field'] = evidence_score_field
            else:
                dict_2_append['score'] = 'None'
                dict_2_append['score_field'] = 'None'

            if molecule not in tmp_evidences:
                tmp_evidences[molecule] = {
                    'evidences': [],
                    'trivial_names': set(),
                }
            tmp_evidences[molecule]['evidences'].append(dict_2_append)
            for trivial_name_key in trivial_name_keys:
                additional_name = line_dict.get(trivial_name_key, '')
                if additional_name != '':
                    # use set to remove double values
                    tmp_evidences[molecule]['trivial_names'].add(
                        additional_name)

    # The position is the run of digits after the LAST ':' so that
    # modification names may contain ':' themselves (e.g. SILAC labels).
    mod_pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    all_molecules = list(molecules)
    all_molecules += list(tmp_evidences.keys())

    for molecule_and_mods in sorted(all_molecules):
        # convert trivial name set to a sorted list for convenience
        if molecule_and_mods in tmp_evidences:
            evidence_entry = tmp_evidences[molecule_and_mods]
            try:
                evidence_entry['trivial_names'] = sorted(
                    evidence_entry['trivial_names'])
            except TypeError:
                # Unorderable names (e.g. None mixed with str from short
                # rows): best effort, keep the unsorted collection.
                pass

        if '#' in molecule_and_mods:
            molecule, modifications = molecule_and_mods.split('#', 1)
        else:
            molecule = molecule_and_mods
            modifications = None

        fixed_label_mod_addon_names = []
        if modifications is not None:
            remaining_mods = []
            for mod_and_pos in modifications.split(';'):
                if mod_and_pos == '':
                    # stray separator (e.g. trailing ';'), nothing to parse
                    continue
                match = mod_pattern.search(mod_and_pos)
                if match is None or match.group('pos') == '':
                    # Never fall back to the values of a previous
                    # modification.
                    raise ValueError(
                        "Modification '{0}' of molecule '{1}' does not end "
                        "with ':<position>'".format(mod_and_pos,
                                                    molecule_and_mods))
                pos = int(match.group('pos'))
                mod = mod_and_pos[:match.start()]
                # Position 0 denotes the N-terminus and is attributed to the
                # first residue (as ChemicalComposition does); index -1 would
                # wrongly address the C-terminal residue.
                modded_aa = molecule[max(pos, 1) - 1]
                if (formatted_fixed_labels is not None
                        and modded_aa in formatted_fixed_labels
                        and mod in all_fixed_mod_names):
                    # fixed label: strip from the molecule name, add its
                    # composition separately below
                    fixed_label_mod_addon_names.append(mod)
                else:
                    remaining_mods.append(mod_and_pos)
            if remaining_mods:
                molecule = '{0}#{1}'.format(molecule,
                                            ';'.join(remaining_mods))
        else:
            # No modifications given: add all fixed modifications of every
            # labeled amino acid.
            # NOTE(review): this also iterates the characters of chemical
            # formulas ('+...') and old-style addons - confirm intended.
            if formatted_fixed_labels is not None:
                for aa in molecule:
                    if aa in formatted_fixed_labels:
                        fixed_label_mod_addon_names.extend(
                            amino_acid_2_fixed_mod_name[aa])

        if molecule.startswith('+'):
            cc_factory.add_chemical_formula(molecule)
        else:
            cc_factory.use(molecule)
        for fixed_mod_name in fixed_label_mod_addon_names:
            cc_factory.add_chemical_formula(fixed_mod_lookup[fixed_mod_name])
        complete_formula = cc_factory.hill_notation_unimod()

        molecule_set.add(molecule)
        if molecule_and_mods in tmp_evidences:
            evidence_lookup.setdefault(
                complete_formula,
                {})[molecule_and_mods] = tmp_evidences[molecule_and_mods]

        # reset the shared factory for the next molecule
        cc_factory.clear()

    molecule_list = list(molecule_set)
    if return_raw_csv_data:
        return (formatted_fixed_labels, evidence_lookup, molecule_list,
                csv_raw_data_to_return)
    return formatted_fixed_labels, evidence_lookup, molecule_list
def setUp(self):
    """Create a UnimodMapper whose unimod xml file name is deliberately wrong."""
    mapper = pyqms.UnimodMapper()
    # Rename the xml file on the mapper before attaching it to the test case.
    mapper.unimod_xml_name = 'wrong_unimod_xml_name.xml'
    self.alt_mapper = mapper
#!/usr/bin/env python3.4 # encoding: utf-8 import pyqms import unittest M = pyqms.UnimodMapper() UNIMODMAPPER_FUNCTIONS = [ M.name2mass, M.name2composition, M.name2id, M.id2mass, M.id2composition, M.id2name, M.mass2name_list, M.mass2id_list, M.mass2composition_list, M.appMass2id_list, M.appMass2element_list, M.appMass2name_list, M.composition2name_list, M.name2specificity_site_list, M.composition2id_list, M.composition2mass, M._map_key_2_index_2_value, ] TESTS = [ [ {
class ChemicalComposition(dict):
    """
    Chemical composition class. The actual sequence or formula can be reset
    using the `use` function.

    The instance itself is a dict mapping element (optionally prefixed with
    the isotope, e.g. ``'15N'``) to its count. Looking up an element that is
    not present yields 0 and inserts it with count 0.

    Keyword Arguments:
        sequence (str): Peptide or chemical formula sequence
        aa_compositions (Optional[dict]): amino acid compositions
        isotopic_distributions (Optional[dict]): isotopic distributions

    Keyword argument examples:

        * **sequence** This can for example be::

            molecules = [
                '+H2O2H2-OH',
                '+{0}'.format('H2O'),
                '{peptide}'.format(peptide='ELVISLIVES'),
                '{peptide}+{0}'.format('PO3', peptide='ELVISLIVES'),
                '{peptide}#{unimod}:{pos}'.format(
                    peptide = 'ELVISLIVES',
                    unimod = 'Oxidation',
                    pos = 1
                )
            ]

    Examples (``repr`` of an instance is its Hill notation; the key order of
    the printed dicts may differ)::

        >>> c = pyqms.ChemicalComposition()
        >>> c.use("ELVISLIVES#Acetyl:1")
        >>> c.hill_notation()
        'C52H90N10O18'
        >>> c.hill_notation_unimod()
        'C(52)H(90)N(10)O(18)'
        >>> c.generate_cc_dict()
        {'O': 18, 'H': 90, 'C': 52, 'N': 10}
        >>> c.composition_of_mod_at_pos[1]
        defaultdict(<class 'int'>, {'O': 1, 'H': 2, 'C': 2})
        >>> c.composition_of_aa_at_pos[1]
        {'O': 3, 'H': 7, 'C': 5, 'N': 1}
        >>> c.composition_at_pos[1]
        defaultdict(<class 'int'>, {'O': 4, 'H': 9, 'C': 7, 'N': 1})

        >>> c = pyqms.ChemicalComposition('+H2O2H2')
        >>> c.generate_cc_dict()
        {'O': 2, 'H': 4}
        >>> c.subtract_chemical_formula('H3')
        >>> c.generate_cc_dict()
        {'O': 2, 'H': 1}

    Note:
        We did not include mass calculation, since pyQms will do it much more
        accurately using unimod and other element enrichments.

    """

    _unimod_parser = pyqms.UnimodMapper()

    # Patterns are compiled once for the class instead of on every call.
    # ':<pos>' at the very end of a unimod entry (name may contain ':').
    _POSITION_PATTERN = re.compile(r""":(?P<pos>[0-9]*$)""")
    # One amino acid letter plus optional number (e.g. SILAC 'R0', 'C1').
    _AA_PATTERN = re.compile(r"""(?P<aa>[A-Z]{1})(?P<N>[0-9]*)""")
    # '+...' / '-...' blocks of an old style addon.
    _ADDON_BLOCK_PATTERN = re.compile(r"[+-][^-+]*")
    # Unimod style formula, e.g. 'C(2)15N(1)N(-1)'; the count may be negative
    # since hill_notation_unimod emits negative counts in that form.
    _UNIMOD_FORMULA_PATTERN = re.compile(
        r"(?P<isotop>[0-9]*)(?P<element>[A-Z][a-z]*)\((?P<count>-?[0-9]+)?\)")
    # Plain formula, e.g. 'C2H6O'.
    _PLAIN_FORMULA_PATTERN = re.compile(
        r"(?P<element>[A-Z][a-z]*)(?P<count>[0-9]*)")
    # Element key with isotope prefix, e.g. '15N'.
    _ISOTOPE_PATTERN = re.compile(
        r"(?P<isotope>[0-9]*)(?P<element>[A-Z][a-z]*$)")

    def __init__(self,
                 sequence=None,
                 aa_compositions=None,
                 isotopic_distributions=None):

        self.composition_of_mod_at_pos = {}
        """dict: chemical composition of unimod modifications at given
        position (if peptide sequence was used as input or using the `use`
        function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.

        Examples::

            c.composition_of_mod_at_pos[1] = {
                '15N': 2,
                '13C': 6,
                'N': -2,
                'C': -6
            }
        """
        self.composition_of_aa_at_pos = {}
        """dict: chemical composition of amino acid at given peptide position
        (if peptide sequence was used as input or using the `use` function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.
        """
        self.composition_at_pos = {}
        """dict: chemical composition at given peptide position incl
        modifications (if peptide sequence was used as input or using the
        `use` function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.
        """
        self.peptide = None
        self.addon = None
        self.unimod_at_pos = {}

        if aa_compositions is None:
            self.aa_compositions = pyqms.knowledge_base.aa_compositions
        else:
            self.aa_compositions = aa_compositions
        if isotopic_distributions is None:
            self.isotopic_distributions = \
                pyqms.knowledge_base.isotopic_distributions
        else:
            self.isotopic_distributions = isotopic_distributions

        if sequence is not None:
            self.use(sequence)

    def __add__(self, other_cc):
        """Experimental: return a deep copy of self with the element counts
        of `other_cc` added."""
        tmp = copy.deepcopy(self)
        for other_key, other_value in other_cc.items():
            tmp[other_key] += other_value
        return tmp

    def __missing__(self, key):
        """Unknown elements count as 0 and are inserted with that count."""
        self[key] = 0
        return 0

    def __repr__(self):
        return self.hill_notation()

    def clear(self):
        """
        Resets all lookup dictionaries and self

        One class instance can be used analysing a series of sequences,
        thereby avoiding class instantiation overhead.

        Warning:

            Make sure to reset when looping over sequences and use the class.
            Chemical formulas (elemental compositions) will accumulate if not
            reset.

        """
        self.composition_of_mod_at_pos.clear()
        self.composition_of_aa_at_pos.clear()
        self.composition_at_pos.clear()
        self.unimod_at_pos.clear()
        self.peptide = None
        self.addon = None
        # remove all element counts held by the dict itself
        dict.clear(self)

    def _parse_sequence_old_style(self, sequence):
        """
        Adaptor for obsolete piqDB format.

        Everything before the first ``+`` or ``-`` is the peptide, the rest
        consists of chemical formula blocks that are added (``+``) or
        subtracted (``-``). One water is added for a non-empty peptide.
        """
        positions = [len(sequence)]
        for sign in ["+", "-"]:
            if sign in sequence:
                positions.append(sequence.index(sign))
        min_pos = min(positions)
        peptide = sequence[:min_pos]
        addon = sequence[min_pos:]
        self.peptide = peptide
        self.addon = addon
        if peptide != "":
            self.add_peptide(peptide)
            self["O"] += 1
            self["H"] += 2
        for block in self._ADDON_BLOCK_PATTERN.findall(addon):
            if block[0] == "+":
                self.add_chemical_formula(block[1:])
            else:
                self.subtract_chemical_formula(block[1:])
        return

    def _parse_sequence_unimod_style(self, sequence):
        """
        Sequence and modification parser in current format.
        Can hold also the modification information i.e. fixed or variable
        mods.

        Note:
            Sequences must not have two modifications at the same position!
            The process is terminated via ``sys.exit`` if a modification has
            no position, cannot be mapped, or shares its position with
            another one.

        Example:
            '{peptide}#{unimod}:{pos}'.format(
                    peptide = 'ELVISLIVES',
                    unimod = 'Oxidation',
                    pos = 1
            )

        """
        hash_index = sequence.index("#")
        peptide = sequence[:hash_index]
        addon = sequence[hash_index + 1:]
        self.peptide = peptide
        if peptide != "":
            self.add_peptide(peptide)
            self["O"] += 1
            self["H"] += 2
        self.addon = addon

        for unimod in addon.split(";"):
            if unimod == "":
                continue
            unimod = unimod.strip()
            if ':' not in unimod:
                print(
                    'This unimod: {0} requires positional information'.format(
                        unimod))
                sys.exit(1)

            # The pattern is anchored to the end, so there is at most one
            # match per entry.
            match = self._POSITION_PATTERN.search(unimod)
            unimodcomposition = None
            unimod_name = None
            if match is not None:
                unimod_name = unimod[:match.start()]
                try:
                    # Use the instance lookup so that a mapper created lazily
                    # in `use` is honoured; falls back to the class mapper.
                    unimodcomposition = self._unimod_parser.name2composition(
                        unimod_name)
                except Exception:
                    print(
                        "Can not map unimod {0}. extracted position argument {1}"
                        .format(unimod, match.start()))
                    sys.exit(1)
            if unimodcomposition is None:
                print(
                    'This unimod: {0} could not be mapped and thus no CC could be read'
                    .format(unimod))
                sys.exit(1)

            position = int(match.group("pos"))
            if position in self.unimod_at_pos:
                sys.exit(
                    "{0} <<- Two unimods at the same position ? ".format(
                        sequence))
            self.unimod_at_pos[position] = unimod_name

            # Full addition
            for k, v in unimodcomposition.items():
                self[k] += v

            # storage of position related modifications
            if position == 0:
                # E.g. Acetylation at pos 0 indicates N-Term
                # but has to be counted for position 1 in this class
                position = 1
            if position not in self.composition_of_mod_at_pos:
                self.composition_of_mod_at_pos[position] = ddict(int)
            if position not in self.composition_at_pos:
                self.composition_at_pos[position] = ddict(int)
            for k, v in unimodcomposition.items():
                self.composition_of_mod_at_pos[position][k] += v
                self.composition_at_pos[position][k] += v
        return

    def use(self, sequence):
        """
        Re-initialize the class with a new sequence

        This is helpful if one wants to use the same class instance
        for multiple sequences since it removes class instantiation overhead.

        Args:
            sequence (str) - See top for possible input formats.

        Note:

            Will clear the current chemical composition dict!
        """
        self.clear()
        if "#" in sequence:
            # Unimod Style format
            if self._unimod_parser is None:
                self._unimod_parser = pyqms.UnimodMapper()
            self._parse_sequence_unimod_style(sequence)
        else:
            self._parse_sequence_old_style(sequence)

    def add_chemical_formula(self, chemical_formula):
        """
        Adds chemical formula to the instance

        Chemical formula can be a string or a dictionary with the element
        count. For example::

            chemical_formula = 'C18H36N9O18'
            chemical_formula = {
                'C' : 18,
                'H' : 36,
                'N' : 9,
                'O' : 18
            }

        """
        self._merge(chemical_formula, mode="addition")
        return

    def add_peptide(self, peptide):
        """
        Adds peptide sequence to the instance.

        The composition of every residue is added to the instance and stored
        per position (starting at 1) in `composition_of_aa_at_pos` and
        `composition_at_pos`. No water is added here.

        Note:
            Only standard amino acids can be processed. If one uses special
            amino acids like (U or F) they have to be added to
            knowledge_base.py. Unknown amino acids terminate the process via
            ``sys.exit``.

        """
        # Number of digit characters seen so far; residues such as SILAC
        # 'R0' / 'C1' carry a number that must not shift the position.
        number_offset = 0
        for aa_match in self._AA_PATTERN.finditer(peptide):
            aa = aa_match.group("aa")
            number = aa_match.group("N")
            pos = aa_match.start() - number_offset + 1
            if number != "":
                number_offset += len(number)
            try:
                aa_compo = self.aa_compositions[aa + number]
            except KeyError:
                sys.exit("Do not know aa composition for {0}".format(aa +
                                                                     number))
            self.add_chemical_formula(aa_compo)

            composition = self._chemical_formula_to_dict(aa_compo)
            self.composition_of_aa_at_pos[pos] = composition
            if pos not in self.composition_at_pos:
                self.composition_at_pos[pos] = ddict(int)
            for k, v in composition.items():
                self.composition_at_pos[pos][k] += v

    def _chemical_formula_to_dict(self, chemical_formula):
        """
        Internal function to convert a chemical formula as string to a
        dictionary.

        Dicts and ChemicalComposition objects are returned unchanged (not
        copied). Strings containing ``(`` are parsed in unimod style
        (``'C(2)15N(-1)'``, isotope prefix kept in the key), all others as
        plain formulas (``'C2H6O'``). A missing count equals 1.
        """
        if isinstance(chemical_formula, (dict, pyqms.ChemicalComposition)):
            return chemical_formula

        unimod_style = "(" in chemical_formula
        if unimod_style:
            pattern = self._UNIMOD_FORMULA_PATTERN
        else:
            pattern = self._PLAIN_FORMULA_PATTERN

        chem_dict = {}
        for match in pattern.finditer(chemical_formula):
            count_string = match.group("count")
            count = int(count_string) if count_string else 1
            if unimod_style:
                element_key = "{0}{1}".format(match.group("isotop"),
                                              match.group("element"))
            else:
                element_key = match.group("element")
            chem_dict[element_key] = chem_dict.get(element_key, 0) + count
        return chem_dict

    @staticmethod
    def _hill_ordered_items(cc_dict):
        """Return the (element, count) pairs with non-zero count: C and H
        first, all other elements sorted by key."""
        majors = ("C", "H")
        ordered_keys = [major for major in majors if major in cc_dict.keys()]
        ordered_keys += [k for k in sorted(cc_dict.keys()) if k not in majors]
        return [(k, cc_dict[k]) for k in ordered_keys if cc_dict[k] != 0]

    def hill_notation(self, include_ones=False, cc=None):
        """
        Formats chemical composition into `Hill notation`_ string.

        .. _Hill Notation:
            https://en.wikipedia.org/wiki/Hill_system

        Args:
            include_ones (bool, optional): also write the count for elements
                that occur exactly once.
            cc (dict, optional): elemental composition dict; self is used if
                omitted.

        Returns:
            str: Hill notation format of self. Elements with count 0 are
            omitted.

                For example::
                    'C50H88N10O17'
        """
        cc_dict = self if cc is None else cc
        parts = []
        for element, count in self._hill_ordered_items(cc_dict):
            parts.append(element)
            # Only a count of exactly 1 may be left out; any other count
            # (including negative ones) must be written.
            if include_ones or count != 1:
                parts.append(str(count))
        return "".join(parts)

    def hill_notation_unimod(self, cc=None):
        """
        Formats chemical composition into `Hill notation`_ string
        adding `unimod`_ features.

        .. _Hill Notation:
            https://en.wikipedia.org/wiki/Hill_system

        .. _unimod:
            http://www.unimod.org/fields.html

        Args:
            cc (dict, optional): elemental composition dict; self is used if
                omitted.

        Returns:
            str: Hill notation format including unimod format rules of self.
            Elements with count 0 are omitted.

                For example::

                    'C(50)H(88)N(10)O(17)'
                    'C(50)H(88)14N(1)N(9)O(17)'

        """
        cc_dict = self if cc is None else cc
        return "".join(
            "{0}({1})".format(
                element.replace("(", "").replace(")", ""), count)
            for element, count in self._hill_ordered_items(cc_dict))

    def _mass(self, cc=None):
        """
        Calculate the mass of the chemical composition.
        Optional cc can be specified, i.e. a cc dict in the style of
        { element : count , ... }

        Elements found in `isotopic_distributions` use the mass of their
        first listed isotope. Keys with an isotope prefix (e.g. ``'15N'``)
        use the isotope of that element whose rounded mass equals the prefix.

        This does however not work with enriched elements, e.g. 15N
        Rather use pyqms.isotopologue_library for more accurate mass
        calculations.

        Raises:
            KeyError: if an element or the requested isotope is unknown.
        """
        cc_mass_dict = self if cc is None else cc
        mass = 0
        for element, count in cc_mass_dict.items():
            if element in self.isotopic_distributions:
                emass = self.isotopic_distributions[element][0][0]
            else:
                match = self._ISOTOPE_PATTERN.search(element)
                if match is None or match.group('isotope') == '':
                    raise KeyError(element)
                nominal_mass = int(match.group('isotope'))
                isotopes = self.isotopic_distributions[match.group('element')]
                for isotope_mass, _distribution in isotopes:
                    if int(round(isotope_mass)) == nominal_mass:
                        emass = isotope_mass
                        break
                else:
                    # Never continue with the mass of a previous element.
                    raise KeyError(element)
            mass += count * emass
        return mass

    def _merge(self, chemical_formula, mode="addition"):
        """
        Generalized function that allows addition and subtraction

        Args:
            chemical_formula (str or dict): formula string or element count
                mapping.
            mode (str): "addition" adds the counts, any other value subtracts
                them.
        """
        sign = +1 if mode == "addition" else -1
        if isinstance(chemical_formula, str):
            chemical_formula = self._chemical_formula_to_dict(chemical_formula)
        for element, count in chemical_formula.items():
            self[element] = self[element] + sign * count
        return

    def subtract_peptide(self, peptide):
        """
        Subtracts peptide (chemical formula) from instance.

        The peptide is split into residues the same way as in `add_peptide`,
        i.e. numbered residues such as 'R0' are treated as one amino acid.
        Only the element counts of the instance are changed; the per position
        lookups and the water added on parsing stay untouched.

        Raises:
            KeyError: if the composition of a residue is unknown.
        """
        for aa_match in self._AA_PATTERN.finditer(peptide):
            residue = aa_match.group("aa") + aa_match.group("N")
            self.subtract_chemical_formula(self.aa_compositions[residue])

    def subtract_chemical_formula(self, chemical_formula):
        """
        Subtracts chemical formula from instance.

        Accepts the same input as `add_chemical_formula`.
        """
        self._merge(chemical_formula, mode="subtraction")
        return

    def generate_cc_dict(self):
        """Return the element counts as a plain dict copy."""
        return dict(self)