Esempio n. 1
0
    def use(self, sequence):
        """
        Re-initialize the instance with a new sequence.

        Reusing one instance for many sequences avoids the overhead of
        instantiating the class again for every sequence.

        Args:
            sequence (str): See top for possible input formats. A sequence
                containing ``#`` is parsed in unimod style, everything else
                in the old style.

        Note:

            Will clear the current chemical composition dict!
        """
        # always start from an empty composition
        self.clear()

        if "#" not in sequence:
            self._parse_sequence_old_style(sequence)
            return

        # unimod style format: make sure a mapper is available before parsing
        if self._unimod_parser is None:
            self._unimod_parser = pyqms.UnimodMapper()
        self._parse_sequence_unimod_style(sequence)
Esempio n. 2
0
def parse_evidence(fixed_labels=None,
                   evidence_files=None,
                   molecules=None,
                   evidence_score_field=None,
                   return_raw_csv_data=False):
    '''
    Reads in the evidence files and returns the final formatted fixed labels,
    the evidence lookup, which is passed to the isotopologue library and the
    final formatted molecules (fixed labels are stripped from the molecules).

    Note:

        Output .csv files from `Ursgal`_ (`Documentation`_) can directly be
        used. Also `mzTab`_ files can be used as input.

    .. _Ursgal:
        https://github.com/ursgal/ursgal

    .. _Documentation:
        http://ursgal.readthedocs.io/en/latest/

    .. _mzTab:
        http://www.psidev.info/mztab

    Args:
        fixed_labels (dict): dict with fixed labels, example format is shown
            below.
        evidence_files (list): list of evidence file paths. The format is
            chosen by the file name ending (``csv`` or ``mztab``, case
            insensitive); files with any other ending are reported on
            stdout and skipped. Defaults to no evidence files.
        molecules (list): list of additional molecules
        evidence_score_field (str): specify fieldname which holds the search
            engine score (Default is "PEP")
        return_raw_csv_data (bool): if True, a dict mapping every parsed
            evidence file path to the list of its row dicts is returned as
            fourth tuple element.

    Example fixed label format::

        {
            'C' : [
                {
                    'element_composition': {
                        'O': 1,
                        'H': 3,
                        '14N': 1,
                        'C': 2
                    },
                    'evidence_mod_name': 'Carbamidomethyl'
                },
            ]
        }

    Returns:

        tuple: final formatted fixed label dict, evidence lookup, list of
        molecules (and the raw row data if `return_raw_csv_data` is True).
        The fixed label dict is None if no fixed labels were given and the
        evidence lookup is None if no evidence file was given.

    Raises:
        ValueError: if a modification of a molecule does not end with
            ``:<position>``.

    '''
    if molecules is None:
        molecules = []
    if evidence_files is None:
        # the default must be iterable, otherwise the loop below crashes
        evidence_files = []
    if evidence_score_field is None:
        evidence_score_field = 'PEP'  # default

    unimod_parser = pyqms.UnimodMapper()

    fixed_mod_lookup = {}
    amino_acid_2_fixed_mod_name = ddict(list)

    formatted_fixed_labels = None
    evidence_lookup = None
    molecule_set = set()

    all_fixed_mod_names = set()

    if fixed_labels is not None and len(fixed_labels) != 0:
        formatted_fixed_labels = {}
        for aa, fixed_mod_info_dict_list in fixed_labels.items():
            for fixed_mod_info_dict in fixed_mod_info_dict_list:
                element_composition = fixed_mod_info_dict[
                    'element_composition']
                evidence_mod_name = fixed_mod_info_dict['evidence_mod_name']
                if isinstance(element_composition, dict):
                    tmp_cc_factory = pyqms.chemical_composition.ChemicalComposition(
                    )
                    tmp_cc_factory.add_chemical_formula(element_composition)
                else:
                    tmp_cc_factory = element_composition
                formatted_fixed_labels.setdefault(aa, []).append(
                    tmp_cc_factory.hill_notation_unimod())
                # save it under name and amino acid!
                fixed_mod_lookup[evidence_mod_name] = dc(tmp_cc_factory)
                amino_acid_2_fixed_mod_name[aa].append(evidence_mod_name)
                all_fixed_mod_names.add(evidence_mod_name)
                tmp_cc_factory.clear()

    cc_factory = pyqms.chemical_composition.ChemicalComposition()

    # this is the lookup for the lib with the evidences
    tmp_evidences = {}

    csv_raw_data_to_return = {}
    for evidence_file in evidence_files:
        # the lookup only becomes a dict once at least one file was given
        if evidence_lookup is None:
            evidence_lookup = {}
        input_is_csv = False
        with codecs.open(evidence_file, mode='r',
                         encoding='utf-8') as openend_evidence_file:
            # first buffer the file here depending on mztab and csv input
            if evidence_file.upper().endswith('CSV'):
                dict_reader = csv.DictReader(openend_evidence_file)
                modification_fieldname = 'Modifications'
                rt_fieldname = 'Retention Time (s)'
                seq_fieldname = 'Sequence'
                input_is_csv = True
            elif evidence_file.upper().endswith('MZTAB'):
                # only the PSM header (PSH) and PSM rows are of interest
                dict_reader = csv.DictReader([
                    row for row in openend_evidence_file
                    if row[:3] in ['PSM', 'PSH']
                ],
                                             delimiter='\t')
                modification_fieldname = 'modifications'
                rt_fieldname = 'retention_time'
                seq_fieldname = 'sequence'
            else:
                print(
                    'The format {0} is not recognized by the pyQms adaptor function'
                    .format(os.path.splitext(evidence_file)[1]))
                # no reader exists for this file, so there is nothing to
                # parse; going on would hit an undefined or stale reader
                continue

            input_buffer = list(dict_reader)
            csv_raw_data_to_return[evidence_file] = input_buffer
            for line_dict in input_buffer:

                modifications = line_dict.get(modification_fieldname, '')
                if modifications == '':
                    molecule = line_dict[seq_fieldname]
                else:
                    if input_is_csv:
                        formatted_mods = line_dict[modification_fieldname]
                    else:
                        formatted_mods = []
                        # 2-UNIMOD:4,3-UNIMOD:4
                        for pos_and_unimod_id in line_dict[
                                modification_fieldname].split(','):
                            pos, unimod_id = pos_and_unimod_id.split('-')
                            unimod_name = unimod_parser.id2name(
                                unimod_id.split(':')[1])
                            formatted_mods.append('{0}:{1}'.format(
                                unimod_name, pos))
                        formatted_mods = ';'.join(formatted_mods)

                    molecule = '{0}#{1}'.format(line_dict[seq_fieldname],
                                                formatted_mods)

                dict_2_append = {}
                rt = line_dict.get(rt_fieldname, '')
                # seconds is the standard also for mzTab
                if rt != '':
                    dict_2_append['RT'] = float(rt) / 60.0  # always in min

                score = line_dict.get(evidence_score_field, '')
                if score != '':
                    dict_2_append['score'] = float(score)
                    dict_2_append['score_field'] = evidence_score_field
                else:
                    dict_2_append['score'] = 'None'
                    dict_2_append['score_field'] = 'None'

                if molecule not in tmp_evidences:
                    tmp_evidences[molecule] = {
                        'evidences': [],
                        'trivial_names': set(),
                    }
                tmp_evidences[molecule]['evidences'].append(dict_2_append)
                for trivial_name_key in [
                        'proteinacc_start_stop_pre_post_;',  # old ursgal style
                        'trivial_name',  # self defined name
                        'Protein ID',  # new ursgal style
                        'accession'  # mzTab style
                ]:
                    additional_name = line_dict.get(trivial_name_key, '')
                    if additional_name != '':
                        # use set to remove double values
                        tmp_evidences[molecule]['trivial_names'].add(
                            additional_name)

    # the position is the number behind the LAST ':' so that modification
    # names which contain ':' themselves can be parsed as well
    mod_pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    all_molecules = list(molecules)
    all_molecules.extend(tmp_evidences)

    for molecule_and_mods in sorted(all_molecules):
        # convert trivial name set to a sorted list for convenience; molecules
        # without evidences simply have no trivial names
        evidence_entry = tmp_evidences.get(molecule_and_mods)
        if evidence_entry is not None:
            try:
                evidence_entry['trivial_names'] = sorted(
                    set(evidence_entry['trivial_names']))
            except TypeError:
                # best effort: names that cannot be ordered stay as they are
                pass

        if '#' in molecule_and_mods:
            molecule, modifications = molecule_and_mods.split('#')
        else:
            molecule = molecule_and_mods
            modifications = None
        fixed_label_mod_addon_names = []
        if modifications is not None:
            mods_to_delete = []
            mod_list = modifications.split(';')
            for pos_in_mod_list, mod_and_pos in enumerate(mod_list):
                match = mod_pattern.search(mod_and_pos)
                if match is None or match.group('pos') == '':
                    # never fall back to the values of a previous modification
                    raise ValueError(
                        'Modification "{0}" of molecule "{1}" requires a '
                        'numeric position after the last ":"'.format(
                            mod_and_pos, molecule_and_mods))
                pos = int(match.group('pos'))
                mod = mod_and_pos[:match.start()]

                # position 0 denotes the N-terminus and is counted for the
                # first residue (as ChemicalComposition does); without this
                # the index -1 would select the last residue
                modded_aa = molecule[max(pos, 1) - 1]

                if (formatted_fixed_labels is not None
                        and modded_aa in formatted_fixed_labels
                        and mod in all_fixed_mod_names):
                    fixed_label_mod_addon_names.append(mod)
                    mods_to_delete.append(pos_in_mod_list)

            # pop from the back so that the remaining indices stay valid
            for modpos_2_remove in sorted(mods_to_delete, reverse=True):
                mod_list.pop(modpos_2_remove)

            if len(mod_list) > 0:
                molecule = '{0}#{1}'.format(molecule, ';'.join(mod_list))
        else:
            # no modifications given: add all fixed modifications of every
            # labelled amino acid in the molecule
            if formatted_fixed_labels is not None:
                for aa in molecule:
                    if aa in formatted_fixed_labels:
                        fixed_label_mod_addon_names.extend(
                            amino_acid_2_fixed_mod_name[aa])

        if molecule.startswith('+'):
            cc_factory.add_chemical_formula(molecule)
        else:
            cc_factory.use(molecule)
        for fixed_mod_name in fixed_label_mod_addon_names:
            cc_factory.add_chemical_formula(fixed_mod_lookup[fixed_mod_name])
        complete_formula = cc_factory.hill_notation_unimod()

        molecule_set.add(molecule)
        if molecule_and_mods in tmp_evidences:
            evidence_lookup.setdefault(
                complete_formula,
                {})[molecule_and_mods] = tmp_evidences[molecule_and_mods]

        cc_factory.clear()

    molecule_list = list(molecule_set)

    if return_raw_csv_data:
        return formatted_fixed_labels, evidence_lookup, molecule_list, csv_raw_data_to_return
    else:
        return formatted_fixed_labels, evidence_lookup, molecule_list
Esempio n. 3
0
 def setUp( self ):
     """Create `self.alt_mapper`, a UnimodMapper whose `unimod_xml_name`
     is set to 'wrong_unimod_xml_name.xml'."""
     alt_mapper = pyqms.UnimodMapper()
     alt_mapper.unimod_xml_name = 'wrong_unimod_xml_name.xml'
     self.alt_mapper = alt_mapper
Esempio n. 4
0
#!/usr/bin/env python3.4
# encoding: utf-8

import pyqms
import unittest

# One shared mapper instance; every entry of UNIMODMAPPER_FUNCTIONS is a
# bound method of it, kept in this exact order.
M = pyqms.UnimodMapper()

UNIMODMAPPER_FUNCTIONS = [
    getattr(M, function_name)
    for function_name in (
        'name2mass',
        'name2composition',
        'name2id',
        'id2mass',
        'id2composition',
        'id2name',
        'mass2name_list',
        'mass2id_list',
        'mass2composition_list',
        'appMass2id_list',
        'appMass2element_list',
        'appMass2name_list',
        'composition2name_list',
        'name2specificity_site_list',
        'composition2id_list',
        'composition2mass',
        '_map_key_2_index_2_value',
    )
]

TESTS = [
    [
        {
Esempio n. 5
0
class ChemicalComposition(dict):
    """
    Chemical composition class. The actual sequence or formula can be reset
    using the `use` function.

    Keyword Arguments:
        sequence (str): Peptide or chemical formula sequence
        aa_compositions (Optional[dict]): amino acid compositions
        isotopic_distributions (Optional[dict]): isotopic distributions


    Keyword argument examples:

        * **sequence**

    This can for example be::

        molecules = [
            '+H2O2H2-OH',
            '+{0}'.format('H2O'),
            '{peptide}'.format(peptide='ELVISLIVES'),
            '{peptide}+{0}'.format('PO3', peptide='ELVISLIVES'),
            '{peptide}#{unimod}:{pos}'.format(
                peptide = 'ELVISLIVES',
                unimod = 'Oxidation',
                pos = 1
            )
        ]

    Examples:

        >>> c = pyqms.ChemicalComposition()
        >>> c.use("ELVISLIVES#Acetyl:1")
        >>> c.hill_notation()
        'C52H90N10O18'
        >>> c.hill_notation_unimod()
        'C(52)H(90)N(10)O(18)'
        >>> c
        {'O': 18, 'H': 90, 'C': 52, 'N': 10}
        >>> c.composition_of_mod_at_pos[1]
        defaultdict(<class 'int'>, {'O': 1, 'H': 2, 'C': 2})
        >>> c.composition_of_aa_at_pos[1]
        {'O': 3, 'H': 7, 'C': 5, 'N': 1}
        >>> c.composition_at_pos[1]
        defaultdict(<class 'int'>, {'O': 4, 'H': 9, 'C': 7, 'N': 1})

        >>> c = pyqms.ChemicalComposition('+H2O2H2')
        >>> c
        {'O': 2, 'H': 4}
        >>> c.subtract_chemical_formula('H3')
        >>> c
        {'O': 2, 'H': 1}


    Note:
        We did not include mass calculation, since pyQms will do it much
        more accurately using unimod and other element enrichments.


    """
    _unimod_parser = pyqms.UnimodMapper()

    def __init__(self,
                 sequence=None,
                 aa_compositions=None,
                 isotopic_distributions=None):
        """
        Set up the empty position lookups and optionally parse a sequence.

        Args:
            sequence (str, optional): passed to `use` if it is not None
            aa_compositions (dict, optional): amino acid compositions,
                defaults to ``pyqms.knowledge_base.aa_compositions``
            isotopic_distributions (dict, optional): isotopic distributions,
                defaults to ``pyqms.knowledge_base.isotopic_distributions``
        """
        self.composition_of_mod_at_pos = {}
        """dict: chemical composition of unimod modifications at given position
        (if peptide sequence was used as input or using the `use` function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.

            Examples::

                c.composition_of_mod_at_pos[1] = {
                    '15N': 2, '13C': 6, 'N': -2, 'C': -6
                }

        """
        self.composition_of_aa_at_pos = {}
        """dict: chemical composition of amino acid at given peptide position
        (if peptide sequence was used as input or using the `use` function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.
        """
        self.composition_at_pos = {}
        """dict: chemical composition at given peptide position incl modifications
        (if peptide sequence was used as input or using the `use` function)

        Note:
            Numbering starts at position 1, since all PSM search engines
            use this nomenclature.
        """
        self.peptide = None
        self.addon = None
        self.unimod_at_pos = {}

        # fall back to the knowledge base only where nothing was supplied
        self.aa_compositions = (pyqms.knowledge_base.aa_compositions
                                if aa_compositions is None else
                                aa_compositions)
        self.isotopic_distributions = (
            pyqms.knowledge_base.isotopic_distributions
            if isotopic_distributions is None else isotopic_distributions)

        if sequence is None:
            return
        self.use(sequence)

    def __add__(self, other_cc):
        """
        Experimental: return a deep copy of self with the counts of
        `other_cc` added.

        Args:
            other_cc: object providing ``items()`` that yields element/count
                pairs, e.g. another ChemicalComposition or a dict.

        Returns:
            a deep copy of self holding the summed element counts
        """
        summed = copy.deepcopy(self)
        for element, count in other_cc.items():
            # elements unknown to the copy start at 0 via __missing__
            summed[element] += count
        return summed

    def __missing__(self, key):
        """Store a count of 0 for an unknown `key` and return the stored
        value, so that ``self[key] += n`` works for new elements."""
        return self.setdefault(key, 0)

    def __repr__(self):
        """Return the representation of the instance, which is the string
        produced by `hill_notation` with its default arguments."""
        return self.hill_notation()

    def clear(self):
        """
        Reset all lookup dictionaries and the element counts of self.

        One class instance can be used for analysing a series of sequences,
        thereby avoiding class instantiation overhead.

        Warning:

            Make sure to reset when looping over sequences and using the
            class. Chemical formulas (elemental compositions) will
            accumulate if not reset.
        """
        for position_lookup in (
                self.composition_of_mod_at_pos,
                self.composition_of_aa_at_pos,
                self.composition_at_pos,
                self.unimod_at_pos,
        ):
            position_lookup.clear()

        self.peptide = None
        self.addon = None

        # drop every element count held by the dict itself
        dict.clear(self)

    def _parse_sequence_old_style(self, sequence):
        """
        Adaptor for obsolete piqDB format.

        Everything in front of the first ``+`` or ``-`` is stored as
        `self.peptide` and added as peptide; the remainder is stored as
        `self.addon` and processed as a series of ``+formula`` (added) and
        ``-formula`` (subtracted) blocks.
        """
        sign_positions = [
            sequence.index(sign) for sign in "+-" if sign in sequence
        ]
        # without any sign the whole sequence is the peptide
        split_at = min(sign_positions + [len(sequence)])

        self.peptide = sequence[:split_at]
        self.addon = sequence[split_at:]

        if self.peptide != "":
            self.add_peptide(self.peptide)
            # on top of the residues one O and two H are counted
            self["O"] += 1
            self["H"] += 2

        block_pattern = re.compile(
            r"""
                        [+|-]{1}
                        [^-+]*
                        """,
            re.VERBOSE,
        )
        for block in block_pattern.findall(self.addon):
            formula = block[1:]
            if block[0] == "+":
                self.add_chemical_formula(formula)
            else:
                self.subtract_chemical_formula(formula)

    def _parse_sequence_unimod_style(self, sequence):
        """
        Sequence and modification parser in current format. Can hold also the
        modification information i.e. fixed or variable mods.

        The part in front of ``#`` is added as peptide (plus one O and two
        H), the part behind it is split at ``;`` into ``<unimod>:<pos>``
        entries. The composition of every unimod is added to self and to the
        position lookups; position 0 is counted for position 1.

        Args:
            sequence (str): sequence that contains ``#``

        Raises:
            ValueError: if a modification does not end with a numeric
                position behind its last ``:``.

        Note:

            Sequences must not have two modifications at the same position!
            Unknown unimods, missing positional information and two unimods
            at the same position terminate the program via ``sys.exit``.

        Example:

            '{peptide}#{unimod}:{pos}'.format(
                peptide = 'ELVISLIVES',
                unimod = 'Oxidation',
                pos = 1
            )
        """
        minPos = sequence.index("#")
        peptide = sequence[:minPos]
        self.peptide = peptide
        if peptide != "":
            self.add_peptide(peptide)
            self["O"] += 1
            self["H"] += 2
        self.addon = sequence[minPos + 1:]

        # the position is the number behind the LAST ':' so that unimod
        # names which contain ':' themselves can be parsed as well
        pattern = re.compile(r""":(?P<pos>[0-9]*$)""")
        for unimod in self.addon.split(";"):
            if unimod == "":
                continue

            unimod = unimod.strip()

            if ':' not in unimod:
                print(
                    'This unimod: {0} requires positional information'.format(
                        unimod))
                sys.exit(1)

            match = pattern.search(unimod)
            if match is None or match.group("pos") == "":
                # previously this surfaced as an obscure error further down
                raise ValueError(
                    'This unimod: {0} requires a numeric position behind '
                    'its last ":"'.format(unimod))
            unimod_name = unimod[:match.start()]

            try:
                # use the parser of the instance, which is what `use`
                # initializes when none is set
                unimodcomposition = self._unimod_parser.name2composition(
                    unimod_name)
            except Exception:
                print(
                    "Can not map unimod {0}. extracted position argument {1}"
                    .format(unimod, match.start()))
                sys.exit(1)

            if unimodcomposition is None:
                print(
                    'This unimod: {0} could not be mapped and thus no CC could be read'
                    .format(unimod))
                sys.exit(1)

            position = int(match.group("pos"))
            if position in self.unimod_at_pos:
                sys.exit(
                    "{0} <<- Two unimods at the same position ? ".format(
                        sequence))
            self.unimod_at_pos[position] = unimod_name

            # full addition to the overall composition
            for k, v in unimodcomposition.items():
                self[k] += v

            # storage of position related modifications
            if position == 0:
                # E.g. Acetylation at pos 0 indicates N-Term
                # but has to be counted for position 1 in this class
                position = 1

            if position not in self.composition_of_mod_at_pos:
                self.composition_of_mod_at_pos[position] = ddict(int)
            if position not in self.composition_at_pos:
                self.composition_at_pos[position] = ddict(int)
            for k, v in unimodcomposition.items():
                self.composition_of_mod_at_pos[position][k] += v
                self.composition_at_pos[position][k] += v

        return

    def use(self, sequence):
        """
        Re-initialize the instance with a new sequence.

        Reusing one instance for many sequences avoids the overhead of
        instantiating the class again for every sequence.

        Args:
            sequence (str): See top for possible input formats. A sequence
                containing ``#`` is parsed in unimod style, everything else
                in the old style.

        Note:

            Will clear the current chemical composition dict!
        """
        # always start from an empty composition
        self.clear()

        if "#" not in sequence:
            self._parse_sequence_old_style(sequence)
            return

        # unimod style format: make sure a mapper is available before parsing
        if self._unimod_parser is None:
            self._unimod_parser = pyqms.UnimodMapper()
        self._parse_sequence_unimod_style(sequence)

    def add_chemical_formula(self, chemical_formula):
        """
        Adds chemical formula to the instance

        Chemical formula can be a string or a dictionary with the element
        count. The work is delegated to `_merge` in "addition" mode, which
        increases the element counts of the instance in place.

        For example::

            chemical_formula = 'C18H36N9O18'
            chemical_formula = {
                'C' : 18,
                'H' : 36,
                'N' : 9,
                'O' : 18
            }

        Args:
            chemical_formula (str or dict): formula to add

        """
        self._merge(chemical_formula, mode="addition")
        return

    def add_peptide(self, peptide):
        """
            Adds peptide sequence to the instance.

            Every upper case letter, optionally followed by a number (e.g.
            the SILAC style residues R0, R1 or C0), is looked up in
            `self.aa_compositions`. Its composition is added to the instance
            and stored per position in `composition_of_aa_at_pos` and
            `composition_at_pos` (positions start at 1 and count residues,
            not characters).

            Args:
                peptide (str): peptide sequence

            Note:

                Only amino acids known to `self.aa_compositions` can be
                processed. Special amino acids have to be added to
                knowledge_base.py. An unknown amino acid terminates the
                program via ``sys.exit``.

        """
        pattern = re.compile(r"""(?P<aa>[A-Z]{1})(?P<N>[0-9]*)""")
        # number of digit characters seen so far; they must not be counted
        # as positions, e.g. in R0 R1 or C0 and so on ...
        number_offset = 0
        for aaN_match in pattern.finditer(peptide):
            aa = aaN_match.group("aa")
            N = aaN_match.group("N")
            pos = aaN_match.start() - number_offset + 1
            number_offset += len(N)
            try:
                aa_compo = self.aa_compositions[aa + N]
            except KeyError:
                # only the failed lookup is handled here, so that e.g. a
                # KeyboardInterrupt is no longer reported as unknown residue
                sys.exit("Do not know aa composition for {0}".format(aa + N))
            self.add_chemical_formula(aa_compo)

            composition = self._chemical_formula_to_dict(aa_compo)
            self.composition_of_aa_at_pos[pos] = composition
            if pos not in self.composition_at_pos:
                self.composition_at_pos[pos] = ddict(int)
            for k, v in composition.items():
                self.composition_at_pos[pos][k] += v

    def _chemical_formula_to_dict(self, chemical_formula):
        """
        Internal function to convert a chemical formula as string to a
        dictionary.

        Two string styles are understood:

            * plain, e.g. ``'C2H3NO'``; a missing count means 1
            * unimod style (detected by ``(`` in the string), e.g.
              ``'C(2)H(3)14N(1)O(1)'``; an optional isotope number in front
              of the element becomes part of the key (``'14N'``) and the
              count in brackets may be negative, e.g. ``'H(-1)'``, which is
              what `hill_notation_unimod` writes for negative counts

        Counts of elements occurring more than once are summed up.

        Args:
            chemical_formula (str or ChemicalComposition): formula to convert

        Returns:
            dict: element (key) to count; a ChemicalComposition is returned
            as is (not copied).
        """
        if isinstance(chemical_formula, pyqms.ChemicalComposition):
            return chemical_formula

        unimod_style = "(" in chemical_formula
        if unimod_style:
            # the optional minus keeps elements with negative counts, which
            # were silently skipped before
            pattern = re.compile(
                r"(?P<isotop>[0-9]*)(?P<element>[A-Z][a-z]*)\((?P<count>(?:-?[0-9]+)?)\)"
            )
        else:
            pattern = re.compile(
                r"(?P<element>[A-Z][a-z]*)(?P<count>[0-9]*)")

        chem_dict = {}
        for match in pattern.finditer(chemical_formula):
            count_string = match.group("count")
            count = 1 if count_string == "" else int(count_string)
            if unimod_style:
                element_key = "{0}{1}".format(match.group("isotop"),
                                              match.group("element"))
            else:
                element_key = match.group("element")
            chem_dict[element_key] = chem_dict.get(element_key, 0) + count
        return chem_dict

    def hill_notation(self, include_ones=False, cc=None):
        """
        Formats chemical composition into `Hill notation`_ string.

        .. _Hill Notation:
            https://en.wikipedia.org/wiki/Hill_system

        C and H come first (if present), all other elements follow in
        sorted order. Elements with a count of 0 are left out.

        Args:
            include_ones (bool, optional): if True the count is always
                written, otherwise only counts greater than 1 are written
            cc (dict, optional): elemental composition dict, self is used
                if omitted

        Returns:
            str: Hill notation format of self.

                For example::

                    'C50H88N10O17'
        """
        majors = ("C", "H")
        cc_dict = self if cc is None else cc

        # membership is tested first so that no key is created in cc_dict
        ordered_elements = [major for major in majors if major in cc_dict]
        ordered_elements += [
            element for element in sorted(cc_dict) if element not in majors
        ]

        parts = []
        for element in ordered_elements:
            count = cc_dict[element]
            if count == 0:
                continue
            parts.append(element)
            if include_ones or count > 1:
                parts.append(str(count))
        return "".join(parts)

    # def hill_notation(self, include_ones=False):
    def hill_notation_unimod(self, cc=None):
        """
        Formats chemical composition into `Hill notation`_ string
        adding `unimod`_ features.

        .. _Hill Notation:
            https://en.wikipedia.org/wiki/Hill_system

        .. _unimod:
            http://www.unimod.org/fields.html

        C and H come first (if present), all other elements follow in
        sorted order, each written as ``element(count)``. Elements with a
        count of 0 are left out and brackets inside element names are
        removed.

        Args:
            cc (dict, optional): elemental composition dict, self is used
                if omitted

        Returns:
            str: Hill notation format including unimod format rules of self.

                For example::

                    'C(50)H(88)N(10)O(17)'
                    'C(50)H(88)14N(1)N(9)O(17)'

        """
        majors = ("C", "H")
        cc_dict = self if cc is None else cc

        # membership is tested first so that no key is created in cc_dict
        ordered_elements = [major for major in majors if major in cc_dict]
        ordered_elements += [
            element for element in sorted(cc_dict) if element not in majors
        ]

        parts = []
        for element in ordered_elements:
            count = cc_dict[element]
            if count == 0:
                continue
            bare_name = element.replace("(", "").replace(")", "")
            parts.append("{0}({1})".format(bare_name, count))
        return "".join(parts)

    def _mass(self, cc=None):
        """
        Calculate the mass of the chemical composition.
        Optional cc can be specified, i.e. a cc dict in the style of
        { element : count , ... }

        For an element found in `self.isotopic_distributions` the mass of
        its first listed isotope is used. Any other key is read as
        ``<isotope number><element>`` (e.g. ``15N``) and the isotope of that
        element whose rounded mass equals the number is used.

        Rather use pyqms.isotopologue_library for more accurate mass
        calculations.

        Args:
            cc (dict, optional): elemental composition dict, self is used
                if omitted

        Returns:
            the sum of count * mass over all elements (0 if empty)

        Raises:
            ValueError: if a key can not be read as isotope and element or
                if the element has no isotope with the requested number.
            KeyError: if the element is not in `self.isotopic_distributions`.
        """
        mass = 0
        cc_mass_dict = self if cc is None else cc
        for element, count in cc_mass_dict.items():
            if element in self.isotopic_distributions:
                emass = self.isotopic_distributions[element][0][0]
            else:
                match = re.search(
                    '(?P<isotope>[0-9]*)(?P<element>[A-Z][a-z]*$)', element)
                if match is None:
                    raise ValueError(
                        'Can not read {0} as isotope and element'.format(
                            element))
                isotopes = self.isotopic_distributions[match.group('element')]
                if match.group('isotope') == '':
                    raise ValueError(
                        'Can not read an isotope number from {0}'.format(
                            element))
                isotope_number = int(match.group('isotope'))
                for _emass, _distribution in isotopes:
                    if int(round(_emass)) == isotope_number:
                        emass = _emass
                        break
                else:
                    # never reuse the mass of the previously handled element
                    raise ValueError(
                        'No isotope {0} known for element {1}'.format(
                            isotope_number, match.group('element')))
            mass += count * emass
        return mass

    def _merge(self, chemical_formula, mode="addition"):
        """
            Generalized function that allows addition and subtraction
            of a chemical formula to / from the instance (in place).

            Args:
                chemical_formula (str or dict): a string is converted with
                    `_chemical_formula_to_dict` first; anything else has to
                    provide ``items()`` yielding element/count pairs
                mode (str): "addition" adds the counts, every other value
                    subtracts them
        """
        factor = 1 if mode == "addition" else -1
        if isinstance(chemical_formula, str):
            chemical_formula = self._chemical_formula_to_dict(chemical_formula)
        for element, count in chemical_formula.items():
            self[element] = self[element] + factor * count

    def subtract_peptide(self, peptide):
        """
            Subtracts peptide (chemical formula) from instance.

            A peptide string is split into residues the same way
            `add_peptide` reads them: an upper case letter optionally
            followed by a number (e.g. R0) is one residue. Every other
            character is looked up on its own, so unknown characters still
            raise a KeyError. Any other iterable is taken as a series of
            residue names.

            Only the element counts of the instance are changed; the
            position lookups are not touched.

            Args:
                peptide (str or iterable of str): residues to subtract

            Raises:
                KeyError: if a residue is not in `self.aa_compositions`.
        """
        if isinstance(peptide, str):
            # keep numbered residues like R0 together instead of looking up
            # the letter and the digit separately
            residues = re.findall(r"[A-Z][0-9]*|[\s\S]", peptide)
        else:
            residues = peptide
        for aa in residues:
            self.subtract_chemical_formula(self.aa_compositions[aa])

    def subtract_chemical_formula(self, chemical_formula):
        """
            Subtracts chemical formula from instance.

            The work is delegated to `_merge` in "subtraction" mode, which
            decreases the element counts of the instance in place.

            Args:
                chemical_formula (str or dict): formula to subtract, given
                    as string or as dictionary with the element count
        """
        self._merge(chemical_formula, mode="subtraction")
        return

    def generate_cc_dict(self):
        """Return the element counts of the instance as a new plain dict."""
        return dict(self)