Example #1
def get_search():
    '''Returns the search query for string matching'''

    if not defaults.DEFAULTS['search_case_sensitive']:
        return re.compile(defaults.DEFAULTS['search_form'], re.I)
    else:
        return re.compile(defaults.DEFAULTS['search_form'])
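The helper above only switches the flags passed to re.compile; the pattern text itself is untouched. A minimal standalone sketch of the same toggle, using a plain dict in place of defaults.DEFAULTS (the keys mirror the example, the values are made up):

import re

# stand-in for defaults.DEFAULTS; keys mirror the example above
DEFAULTS = {'search_form': 'cytochrome', 'search_case_sensitive': False}

def get_search(settings=DEFAULTS):
    '''Compile the search pattern, case-insensitively unless requested'''
    flags = 0 if settings['search_case_sensitive'] else re.I
    return re.compile(settings['search_form'], flags)

assert get_search().search('Cytochrome C') is not None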
Example #3
    def __init__(self):
        super(UniProtServerIterator, self).__init__()

        self.downloader = uniprot.GeneDownloader()

        if self.escape_filter:
            self.regex = re.compile(re.escape(self.filter))
        else:
            self.regex = re.compile(self.filter)
Example #5
    def fromengine(cls, engine, start=False, end=False):
        '''Compiles the start and end subs and initializes the class'''

        startsub = engine.defaults.start
        if start:
            startsub = re.compile(startsub)

        endsub = engine.defaults.end
        if end:
            endsub = re.compile(endsub)
        return cls(startsub, endsub)
Example #7
    def __init__(self, source):
        super(ParsePDCsv, self).__init__(source)

        logger.Logging.info("Initializing ParsePDCsv....")

        self.engine = get_engine(self.data)
        residues = ''.join(chemical_defs.AMINOACIDS)
        nterm, cterm = map(self.engine.get, ['nterm', 'cterm'])
        self.mod_parser = re.compile(self._mod.format(residues, nterm, cterm))
Example #8
class ParseCsv(base.MatchedPeptideBase):
    '''Processes matched-scan data from a file object into a dictionary'''

    # REGEXP
    # ------
    header = re.compile(HEADER)

    def __init__(self, row):
        super(ParseCsv, self).__init__(row)

        self.csv = csv_.CSVUtils(row)
        self.csv.process['modifications'] = self.getmodification
        self.modparser = modifications.ModificationParser(row)

    @logger.call('matched', 'debug')
    def __call__(self):
        '''
        Finds the search name from Protein Prospector output, then reads
        the dataframe and dumps it to a dictionary of lists, where the
        keys are column names and the values are lists of column entries.
        '''

        self.setfileheader()
        header = self.row.engines['matched'].defaults.header - 1
        while header:
            self.fileobj.readline()
            header -= 1

        self.csv.set_reader(self.fileobj)
        self.csv()
        self.setids()

    #     SETTERS

    def setfileheader(self):
        '''Grabs the search and project names from the header line.'''

        match = self.header.match(self.fileobj.readline())
        self.row.data['attrs']['project'] = match.group(1).strip()
        self.row.data['attrs']['search'] = match.group(2).strip()

    #     GETTERS

    def getmodification(self, unparsed):
        '''Returns the parsed modification from the unparsed string'''

        peptide = self.row.data['matched']['peptide'][-1]
        start = self.row.data['matched']['start'][-1]

        # parse the modification
        if isinstance(unparsed, list):
            unparsed = unparsed[0]
        modification = Modification(unparsed, peptide, start)
        self.modparser(modification)

        return modification.todict()
Example #9
    def __init__(self, fileobj, group, engine):
        super(ParseText, self).__init__()

        self.fileobj = fileobj
        self.group = group
        self.source = self.app.discovererthread
        self.scan_finder = scan_parser.ScanFinder.fromengine(engine)

        self._parser = getattr(self, PARSERS[engine.tostr()])
        self.re_scan = re.compile(engine.defaults.regexp)
Example #10
class ColumnsDict(dict):
    '''Custom __getitem__ that removes suffixes'''

    # REGEXES
    # -------
    suffix = re.compile(r'^.* (?:[A-Z0-9])$')
    whitespace = re.compile(r'\s+')

    #      MAGIC

    def __getitem__(self, key, dict_getitem=dict.__getitem__):
        '''CD[k] -> v'''

        key = self._keychecker(key)
        return dict_getitem(self, key)

    #     HELPERS

    def _keychecker(self, item):
        '''Returns the suffix-less key'''

        if isinstance(item, six.string_types):
            return self._stringchecker(item)
        elif isinstance(item, (list, tuple)):
            return self._sequencechecker(item)

    def _stringchecker(self, item):
        '''Returns the suffix-less key from a string column'''

        if self.suffix.match(item):
            return item[:-2]
        return item

    def _sequencechecker(self, item):
        '''Returns the suffix-less key from a column sequence'''

        # comparative has null string columns in row 1, while quantitative
        # has null or isotope columns in row 0
        string = [i for i in item if self.whitespace.sub("", i)][-1]
        if self.suffix.match(string):
            return string[:-2]
        return string
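A quick standalone check of the suffix regex above: a column name ending in a space plus a single uppercase letter or digit loses its last two characters, exactly as _stringchecker does (the column names here are illustrative):

import re

suffix = re.compile(r'^.* (?:[A-Z0-9])$')   # same pattern as ColumnsDict.suffix

for column in ('Intensity A', 'Area 2', 'Peptide'):
    key = column[:-2] if suffix.match(column) else column
    print(column, '->', key)   # 'Intensity A' -> 'Intensity'; 'Peptide' unchanged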
Example #11
    def __init__(self, row, group, fileobj):
        super(ParseFullms, self).__init__()

        self.row = row
        self.group = group
        self.fileobj = fileobj

        engine = row.engines['ms1']
        self.extractor = scans.ChromatogramExtractor(row)
        self.scan_finder = scan_parser.ScanFinder.fromengine(engine, end=True)

        self._parser = getattr(self, PARSERS[engine.tostr()])
        self.re_scan = re.compile(engine.defaults.regexp)
Example #13
# LICENSE=Licensed to: ---
# MP=
# NM=
# COM=10ftmol BSA
# IATOL=
# IA2TOL=
# IASTOL=
# IBTOL=
# IB2TOL=
# IBSTOL=
# IYTOL=
# IY2TOL=

# REGEXP
# ------
CONSTANT_RESIDUES = re.compile(r'C_term|N_term|[A-Z]')
FIXEDMOD = re.compile(r'^FixedMod\d*$')
FIXMOD_RESIDUES = re.compile(r'^FixedModResidues\d*$')

# CONSTANTS
# ---------
TERMINI = {'C_term': 'cterm', 'N_term': 'nterm'}

# HELPERS
# -------


@logger.init('matched', 'DEBUG')
class Hits(base.BaseObject):
    '''Convenience class to facilitate adding Mascot search hits to a scan'''
    def __init__(self, modifications, queries):
Example #14
class MSFParser(SQLiteUtils):
    '''
    Provides convenient methods to parse the MSF file format, which is
    a simple, readable SQLITE3 format.
    '''

    _fraction = None
    _path_sep = re.compile(r'\\|/')

    def __init__(self, parent):
        super(MSFParser, self).__init__()

        self.data = parent.data
        self.engine = get_engine(self.data)
        self.source = parent.source
        self.fragments = parent.fragments

        self.database = sqlite3.connect(parent.fileobj.name)
        self.cursor = self.database.cursor()

        self.peptide_to_spectrum = {}
        self.peptides = mapping.OrderedRecursiveDict()
        self.proteins = {}

        # all mods are defined internally within the file
        mods = ProteomeDiscovererMods(self.cursor, self.engine)
        # need to convert to the namedtuple format
        params.tupleize(mods)
        self.mod_ids = mods.ids
        self.engine['mods'].update(mods)

    def run(self):
        '''On start'''

        self.init_spectrum()

        self.fetch_enzyme()
        self.set_file()
        self.add_spectrum()
        self.add_scores()
        self.add_mods()
        self.add_ppms()
        self.set_proteins()
        self.add_proteins()

    # ------------------
    #       MAIN
    # ------------------

    def init_spectrum(self):
        '''Sets the peptides with a SpectrumID as a key reference'''

        columns = "PeptideID, SpectrumID, Sequence, SearchEngineRank"
        self.cursor.execute("SELECT {} FROM Peptides;".format(columns))
        # Spectrum ID is the unique spectral identifier
        # Peptide ID is the unique peptide identifier
        # Sequence is the peptide sequence, ex: "KATNE"
        for peptide_id, spectrum_id, sequence, rank in self.cursor:
            self.peptide_to_spectrum[peptide_id] = spectrum_id
            entry = self.peptides[spectrum_id][peptide_id]
            entry['peptide'] = sequence
            entry['rank'] = rank

    def fetch_enzyme(self):
        '''Extracts the proteolytic enzyme for the peptide search'''

        columns = "ParameterName, ParameterValue"
        table = "ProcessingNodeParameters"
        items = {k: v for k, v in self.fetch("fetchall", columns, table)}
        try:
            self.data['enzyme'] = items['Enzyme']
        except KeyError:
            # no enzyme defined
            pass

    def set_file(self):
        '''Sets the current file name from the RAW file'''

        files = self.fetch("fetchall", "FileName", "FileInfos")
        # joins consecutive file names together, FileInfos should be 1
        # use regex path splitter, since Windows path mappings don't work
        # on Linux or OS X.
        files = (self._path_sep.split(i[0])[-1] for i in files)
        self._fraction = ' - '.join(files)

        for entry in self.peptides.values():
            entry['fraction'] = self._fraction

    def add_spectrum(self):
        '''Adds the spectral data, which includes the m/zs and charge states'''

        columns = "SpectrumID, ScanNumbers, Charge, Mass"
        self.cursor.execute("SELECT {} FROM SpectrumHeaders;".format(columns))
        # Spectrum ID is the unique spectral identifier
        # num is the scan number associated with the spectral ID
        # charge is the charge state of the peptide
        # mass is the singly-charged mass of the peptide
        for spectrum_id, num, charge, mass in self.cursor:
            # peptide_ids = self.spectrum[spectrum_id]

            self.peptides[spectrum_id]['num'] = num
            self.peptides[spectrum_id]['z'] = charge
            mz = masstools.mz(mass - params.PROTON_MASS, charge, 0)
            self.peptides[spectrum_id]['m/z'] = mz

    def add_scores(self):
        '''
        Adds the score and calculates the EV from the score
        :
            score == -10*log(ev, 10)
        '''

        columns = "PeptideID, ScoreValue"
        self.cursor.execute("SELECT {} FROM PeptideScores;".format(columns))
        # Peptide ID is the unique peptide identifier, which
        # is for each peptide hit from the spectral ID
        # Score is the - 10 * log(p-value, 10)
        for peptide_id, score in self.cursor:
            expect = 10**(-score / 10)
            spectrum_id = self.peptide_to_spectrum[peptide_id]

            entry = self.peptides[spectrum_id][peptide_id]
            entry['score'] = score
            # need to calculate the p-value, or expectation value
            entry['ev'] = expect

    def add_mods(self):
        '''Finds all the target mods from the IDs and adds them to a holder'''

        self._mod_templates()
        self._internal_mods()
        self._terminal_mods()

    def add_ppms(self):
        '''Calculates the PPMs from the mods and peptides for each entry'''

        keys = ['peptide', 'm/z', 'z']
        for peptide_id, spectrum_id in self.peptide_to_spectrum.items():
            entry = self.peptides[spectrum_id]
            hit = entry[peptide_id]
            mod = hit['mods']
            peptide, exper, charge = map(entry.get, keys)

            hit['formula'] = formula = self.calculate_formula(peptide, mod)
            hit['ppm'] = self.calculate_ppm(formula, mod, exper, charge)

    def set_proteins(self):
        '''Creates a {ProteinID: (UniProt ID, Protein Name)} holder'''

        columns = 'ProteinID, Description'
        table = "ProteinAnnotations"
        self.cursor.execute("SELECT {0} FROM {1};".format(columns, table))

        for protein_id, description in self.cursor:
            # description == '>sp|P62894|CYC_BOVIN Cytochrome ...'
            id_, name = description.split('|')[1:]
            self.proteins[protein_id] = (id_, name)

    def add_proteins(self):
        '''Adds the protein names and IDs to each entry'''

        columns = "PeptideID, ProteinID"
        self.cursor.execute("SELECT {} FROM PeptidesProteins;".format(columns))

        for peptide_id, protein_id in self.cursor:
            spectrum_id = self.peptide_to_spectrum[peptide_id]
            entry = self.peptides[spectrum_id][peptide_id]
            id_, name = self.proteins[protein_id]

            entry['id'] = id_
            entry['name'] = name

    # ------------------
    #      UTILS
    # ------------------

    #      MODS

    def _internal_mods(self):
        '''Adds all the internal modifications to the mod holders'''

        columns = "PeptideID, AminoAcidModificationID, Position"
        table = "PeptidesAminoAcidModifications"
        self.cursor.execute("SELECT {} FROM {};".format(columns, table))

        for peptide_id, mod_id, position in self.cursor:
            spectrum_id = self.peptide_to_spectrum.get(peptide_id)
            modname = self.mod_ids[mod_id]

            # for some weird reason, the mods can have peptide IDs which don't
            # exist otherwise, causing errors. Not decoys, nothing.
            if spectrum_id is not None:
                mods = self.peptides[spectrum_id][peptide_id]['mods']
                mods['certain'].setdefault(modname, [])
                mods['certain'][modname].append(position)

    def _terminal_mods(self):
        '''Adds all the N-/C-terminal modifications to the mod holders'''

        columns = "PeptideID, TerminalModificationID"
        table = "PeptidesTerminalModifications"
        self.cursor.execute("SELECT {} FROM {};".format(columns, table))
        for peptide_id, mod_id in self.cursor:
            spectrum_id = self.peptide_to_spectrum.get(peptide_id)

            # for some weird reason, the mods can have peptide IDs which don't
            # exist otherwise, causing errors. Not decoys, nothing.
            if spectrum_id is not None:
                mods = self.peptides[spectrum_id][peptide_id]['mods']

                modname = self.mod_ids[mod_id]
                mods['certain'].setdefault(modname, [])
                if self.engine['nterm'] in self.engine['mods'][modname][1]:
                    mods['certain'][modname].append(self.engine['nterm'])

                else:
                    mods['certain'][modname].append(self.engine['cterm'])

    def _mod_templates(self):
        '''Sets the mod template holders for each peptide ID'''

        template = params.TEMPLATES['mods']
        for peptide_id, spectrum_id in self.peptide_to_spectrum.items():
            hit = self.peptides[spectrum_id][peptide_id]
            hit.setdefault('mods', copy.deepcopy(template))
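The score/expectation-value relation used in add_scores is easy to sanity-check in isolation; this snippet just replays the arithmetic from the method above with an illustrative score:

import math

score = 30.0                   # Mascot-style ion score (illustrative value)
expect = 10 ** (-score / 10)   # as computed in add_scores
assert abs(-10 * math.log10(expect) - score) < 1e-9
print(expect)                  # 0.001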
Example #15
# CONSTANTS
# ---------
REPLACE = '[REPLACE]'
SKYLINE = r'\[CROSSLINKER(\+-?[0-9]+\.[0-9])?\]'

CROSSLINKER_POSITIVE = '[CROSSLINKER+{0}]'
CROSSLINKER_NEGATIVE = '[CROSSLINKER{0}]'
CROSSLINKER = '[CROSSLINKER]'

SKYLINE_POSITIVE = '[+{0}]'
SKYLINE_NEGATIVE = '[-{0}]'

# REGEXES
# -------
PARENTHESES = re.compile(r'\(|\)')
LETTERS = re.compile('([A-Z]{2})')
MASS_MODIFICATION = re.compile(r'(\[(?:\+|-)([0-9]*\.?[0-9]?)\])')


# HELPERS
# -------


def mappedresidues(deadends):
    '''Maps each individual residue to the CSV residue keys that contain it'''

    mapped = defaultdict(list)
    for key in deadends:
        for residue in key.split(','):
            mapped[residue].append(key)
Example #16
class PeptideDatabase(PeptideDBSettings, CutSites, AddMods, HDF5Utils):
    '''
    Creates a custom peptide database with various settings, covering 1 to
    the maximum number of crosslinker modifications for each permutation,
    and up to the maximum number of mods (including XL mods) defined by
    the search.
    '''

    db_modes = {True: {'standard', 'decoy'}, False: {'standard'}}
    peptides = None
    searchables = None
    mods_length = None
    modifications_dtype = None
    mod_ids = None
    _mode = None

    id_regex = re.compile(uniprot.SERVER['id']['regex'], re.IGNORECASE)
    entry_regex = re.compile(uniprot.SERVER['entry']['regex'], re.IGNORECASE)

    def __init__(self, grp, xler, source):
        super(PeptideDatabase, self).__init__()

        self.grp = grp
        self.xler = xler
        self.source = source
        self.mods = params.CUSTOM_MODS
        self.react = set(self.xler['react_sites'])
        # default to true, newly set value
        self.uncleaved = self.xler.get("uncleaved", True)
        self.fragment_masses = self.get_fragment_masses()
        self.basemods = self.organize_basemods()
        self._set_mod_ids()

        self.sequences = self._get_sequences()

        self.decoy = params.MASS_FINGERPRINT['search_decoys']
        self._peptide_holders()

        self.run()

    def run(self):
        '''On start'''

        self._mode = 'standard'
        self.add_ids()
        self.make_searchables()
        if self.decoy:
            self._mode = 'decoy'
            self.make_searchables()

        for key in {'base_peptides', 'peptides'}:
            del self.grp[key]
        self.linearize()

    # ------------------
    #       PUBLIC
    # ------------------

    def add_ids(self):
        '''Adds a way to map the sequence ids to the current holder'''

        ids = [i.encode('utf-8') for i in self.sequences]
        self.grp.attrs.create('protein_ids', data=ids)
        names = [i.name.encode('utf-8') for i in self.sequences.values()]
        self.grp.attrs.create('protein_names', data=names)

    def make_searchables(self):
        '''On start'''

        self.cut_sequences()
        self.add_mods()

    # ------------------
    #   PRIVATE -- INIT
    # ------------------

    def _get_sequences(self):
        '''Updates sequence dictionary with entries from the limited db'''

        sequences = {}
        limited_db = self.source.limited_database
        for id_ in limited_db:
            if id_ in self.source.custom_proteins:
                sequences[id_] = self.source.custom_proteins[id_]
            elif id_ in self.source.gene_name:
                sequences[id_] = self.source.gene_name[id_]

        return sequences

    def _peptide_holders(self):
        '''Adds peptide holders for later simulated cutting'''

        for key in {'base_peptides', 'decoy', 'standard'}:
            self.grp.create_group(key)

        peptides = self.grp.create_group('peptides')
        db_keys = self.db_modes[self.decoy]
        for key in db_keys:
            for missed_cleavages in range(self.max_missed + 1):
                peptides.create_group('{}/{}'.format(key, missed_cleavages))

        # temporary, fast data holder for in memory searchables
        self.searchables = {}
        for key in db_keys:
            self.searchables[key] = defaultdict(list)

    def _set_mod_ids(self):
        '''
        Assigns unique mod ids for each modification and stores a copy
        in the dataset.
        '''

        names = [i[0] for item in self.basemods.values() for i in item]
        names += self.xler['fragments']['name']
        # grabs the number of places required to store data
        self.mods_length = (len(names) // 10) + 1
        # +2 for n and c-term
        max_mods = self.mods_length * self.max_length + 2
        self.modifications_dtype = 'S{}'.format(max_mods)

        self.mod_ids = {}
        for index, name in enumerate(names):
            self.mod_ids[name] = index

        # store as attrs to unpack later
        self.grp.attrs.create('modification_ids', data=range(len(names)))
        bin_names = [i.encode('utf-8') for i in names]
        self.grp.attrs.create('modification', data=bin_names)
Example #17
# MP=
# NM=
# COM=10ftmol BSA
# IATOL=
# IA2TOL=
# IASTOL=
# IBTOL=
# IB2TOL=
# IBSTOL=
# IYTOL=
# IY2TOL=


# REGEXP
# ------
CONSTANT_RESIDUES = re.compile(r'C_term|N_term|[A-Z]')
FIXEDMOD = re.compile(r'^FixedMod\d*$')
FIXMOD_RESIDUES = re.compile(r'^FixedModResidues\d*$')

# CONSTANTS
# ---------
TERMINI = {
    'C_term': 'cterm',
    'N_term': 'nterm'
}


# HELPERS
# -------

'''

# load modules
from xldlib import exception
from xldlib.definitions import re, ZIP
from xldlib.objects import matched
from xldlib.qt.objects import base
from xldlib.utils import logger

from . import hierarchical


# REGEXES
# -------

NUMBER = re.compile(r'-?[0-9]*\.?[0-9]+')


# HELPERS
# -------


@logger.init('matched', 'DEBUG')
class CheckTermini(base.BaseObject):
    '''Helper class which identifies false modification termini'''

    def __init__(self, engine):
        super(CheckTermini, self).__init__()

        self.nterm = engine.defaults.nterm
        self.engine_modifications = engine.defaults.modifications
from xldlib.definitions import re

# load objects/functions
from collections import defaultdict


# CONSTANTS
# ---------

HEADER_CHARACTERS = 3

# REGEXP
# ------

FIRST_CAP = re.compile('(.)([A-Z][a-z]+)')
ALL_CAP = re.compile('([a-z0-9])([A-Z])')


# REGISTER
# --------
# Item register to avoid malicious, external script execution

REGISTER = defaultdict(set)
NAME_REGISTER = {}

# DATA
# ----


BUILTINS = {
Example #20
    (comparable to BioPython).

    :copyright: (c) 2015 The Regents of the University of California.
    :license: GNU GPL, see licenses/GNU GPLv3.txt for more details.
'''

# load modules
from xldlib import exception
from xldlib.definitions import re
from xldlib.utils import logger
from xldlib.utils.io_ import high_level, ziptools

# REGEXP
# ------

ISOFORM = re.compile(r'^(.*)\.\d$')
HYPHEN = re.compile(r'-')
ASTERIX = re.compile(r'\*')

# OBJECTS
# -------


class FastaParserMixin(object):
    '''
    Mixin to provide methods to parse FASTA records using
    specific identifiers.
    '''

    #   NON-PUBLIC
Example #21
    Processing Mascot modnames depending on the UniMod specification.

    :copyright: (c) 2015 The Regents of the University of California.
    :license: GNU GPL, see licenses/GNU GPLv3.txt for more details.
'''

# load modules/submodules
from xldlib import resources
from xldlib.definitions import re
from xldlib.qt.objects import base

# REGEXES
# -------

PARSERS = [re.compile(i.regexp) for i in resources.SCAN_TITLES]

# MATCHER
# -------


class TitleFormatter(base.BaseObject):
    '''
    Identify the scan title format based on regex matches, otherwise
    raise an `AssertionError`. After identifying the title format,
    use the title formatter to extract the scan number from
    scan filters.
    '''
    def __init__(self):
        super(TitleFormatter, self).__init__()
Example #22
class ChemicalParserMixin(object):
    '''Mixin for parsing chemical formulas and updating a mapping object'''

    # REGEX
    # -----
    chemical = re.compile(r'^{}$'.format(CHEMICAL))
    monomer = re.compile(r'^{}$'.format(MONOMER))
    atom = re.compile(r'[A-Z][a-z]?')

    #      PUBLIC

    def update_chemical(self, formula, count):
        '''
        Update atomic counts from a string or mapping `formula`, and
        multiply by `count`.

        Args:
            formula (string, mapping):  chemical or glycan formula
            count (int):                scalar for elemental counts
        '''

        if isinstance(formula, Mapping):
            self._update_mapping(formula, count)
        elif isinstance(formula, six.string_types):
            self._update_str(formula, count)

    #    NON-PUBLIC

    def _update_mapping(self, formula, count):
        '''
        Update the elemental counts from a mapping `formula` multiplied
        by a scalar `count`. See `update_chemical` for full arg specs.
        '''

        for symbol, isotopes in formula.items():
            for isotope, number in isotopes.items():
                self[symbol][isotope] += number * count

    def _update_str(self, formula, count):
        '''
        Update the elemental counts from a str `formula` multiplied
        by a scalar `count`. See `update_chemical` for full arg specs.
        '''

        for item in formula.split():
            for symbol, isotope, number in self._parse(item):
                self[symbol][isotope] += number * count

    def _parse(self, item):
        '''
        Extract atomic symbol, isotope, and atomic counts from
        the `item` string.

        Args:
            item (str):  str in format of "13C6", "13C(6)", "Hex", "C6"
        '''

        match = self.chemical.match(item)
        if match:
            return [self._parse_element(match)]
        else:
            return self._parse_monomer(self.monomer.match(item))

    def _parse_element(self, match):
        '''Extract elemental data from re `match` group'''

        isotope, symbol, free, parentheses = match.groups()

        assert self._symbolchecker(symbol)
        isotope = int(isotope or -1)
        count = _element_count(free, parentheses)

        return symbol, isotope, count

    def _parse_monomer(self, match):
        '''Extract glycan monomer data from re `match` group'''

        monomer, free, parentheses = match.groups()
        count = _element_count(free, parentheses)
        formula = MONOMERS[monomer]

        for item in formula.split():
            match = self.chemical.match(item)
            symbol, isotope, number = self._parse_element(match)
            yield symbol, isotope, number * count

    #    HELPERS

    def _symbolchecker(self, symbol):
        '''Check validity of atomic `symbol`'''

        return self.atom.match(symbol)
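The CHEMICAL and MONOMER patterns are defined elsewhere in the module, so only their group order is visible here. A rough standalone sketch of the element branch handled by _parse_element, using an assumed pattern whose groups are (isotope, symbol, free count, parenthesized count); the real xldlib pattern and its _element_count helper may differ:

import re

# assumed stand-in for CHEMICAL
CHEMICAL = re.compile(r'^(\d+)?([A-Z][a-z]?)(?:(\d+)|\((\d+)\))?$')

def parse_element(item):
    '''Parse strings like "13C6", "13C(6)", or "C6" into (symbol, isotope, count)'''
    isotope, symbol, free, parentheses = CHEMICAL.match(item).groups()
    count = int(free or parentheses or 1)
    return symbol, int(isotope or -1), count

print(parse_element('13C6'))    # ('C', 13, 6)
print(parse_element('C6'))      # ('C', -1, 6)
print(parse_element('13C(6)'))  # ('C', 13, 6)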
Example #23
    def getproteinmods(self, modification, start):
        '''Gets the protein mods from the manually specified data'''

        if defaults.DEFAULTS['concatenate_hybrid_modifications']:
            modification = modification.concatenate()

        positions = list(self.getcertain(modification, start))
        if modification['uncertain']:
            positions.append(self.getuncertain(modification, start))

        return positions


# REGEXP
# ------
LETTERS = re.compile('([A-Z]{2})')
PARENTHESES = re.compile(r'(\))')

# STRINGS
# -------
TERMINUS = '{0}-{1}'
INTERNAL = '{0}({1}){2}'

# PEPTIDE-EMBEDDED MODIFICATIONS
# ------------------------------


@logger.init('spreadsheet', 'DEBUG')
class ModificationsInPeptide(base.BaseObject):
    '''Adds the given user mods to the target peptide sequence'''
Example #24
from xldlib.utils import decorators, logger, xictools

__all__ = [
    'Amplitudes', 'Dataframe', 'HierarchicalDataframe', 'QuantitativeDataframe'
]

# CONSTANTS
# ---------

LOWER_SIGMA = u'\u03C3'
UPPER_SIGMA = u'\u03A3'

# REGEXES
# -------

NONQUANTIFIED = re.compile(u'<|>|-|{}'.format(xictools.INFINITY), re.UNICODE)

# DATA
# ----

CONCATENATED = {
    'report',
    'best_peptide',
    'best_peptide_file',
}

POLYPEPTIDE = {
    reports.LINKTYPES['interlink'],
    reports.LINKTYPES['multilink'],
}
Example #25
    Processing Mascot modnames depending on the UniMod specification.

    :copyright: (c) 2015 The Regents of the University of California.
    :license: GNU GPL, see licenses/GNU GPLv3.txt for more details.
'''

# load modules/submodules
from xldlib import resources
from xldlib.definitions import re
from xldlib.qt.objects import base


# REGEXES
# -------

PARSERS = [re.compile(i.regexp) for i in resources.SCAN_TITLES]


# MATCHER
# -------


class TitleFormatter(base.BaseObject):
    '''
    Identify the scan title format based on regex matches, otherwise
    raise an `AssertionError`. After identifying the title format,
    use the title formatter to extract the scan number from
    scan filters.
    '''

    def __init__(self):
Example #26
from xldlib.definitions import re, ZIP
from xldlib.qt.objects import base
from xldlib.resources.parameters import defaults
from xldlib.utils import decorators, logger
from xldlib.xlpy.tools import peak_picking

# load objects/functions
from collections import namedtuple

# OBJECTS
# -------
BinaryData = namedtuple("BinaryData", "data precision compression")

# REGEXP
# ------
MZML_SCAN = re.compile(r'scan=([0-9]+)')

# HELPERS
# -------


@logger.init('scans', level='DEBUG')
class Start(base.BaseObject):
    '''Utilities for processing data from XML start elements'''
    def __init__(self, group):
        super(Start, self).__init__()

        self.group = group
        self.source = self.app.discovererthread

    def spectrum(self, attrs):
Example #27
class ScientificSpinBox(DoubleSpinBox):
    '''
    Recipe for a QDoubleSpinBox with support for floating point
    numbers in scientific notation.
    '''

    # VALIDATION
    # ----------
    float_regex = re.compile(SCIENTIFIC)
    precision = 1

    def __init__(self, parent=None, **kwds):
        super(ScientificSpinBox, self).__init__(parent, **kwds)

        self.validator = QtGui.QDoubleValidator()
        self.validator.setNotation(QtGui.QDoubleValidator.ScientificNotation)

        self._string = '{{:0.{num}e}}'.format(num=self.precision)
        self.formatter = self._string.format

    #  PUBLIC FUNCTIONS

    def validate(self, text, position):
        return self.validator.validate(text, position)

    def fixup(self, text):
        return self.tostr(self.tofloat(text))

    def valueFromText(self, text):
        return self.tofloat(text)

    def textFromValue(self, value):
        return self.tostr(value)

    def stepBy(self, steps):
        value = self.value() + steps * self.singleStep()
        self.lineEdit().setText(self.formatter(value))

    #    HELPERS

    def tofloat(self, text, default=0.):
        '''Returns a floating-point representation from text'''

        match = self.float_regex.match(text)
        if match is not None:
            return self.frommatch(match)
        return default

    def tostr(self, value):
        return self.formatter(value)

    @staticmethod
    def frommatch(match):
        '''Converts a match group to a floating point representation'''

        sig, sign, exp = match.groups()
        if sign is None:
            sign = '+'
        if exp is None:
            exp = '0'
        return float(sig) * (10**int(sign + exp))
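SCIENTIFIC itself is defined elsewhere in the module. Below is a minimal sketch of the tofloat/frommatch logic with an assumed pattern whose three groups are (significand, exponent sign, exponent digits); the real xldlib pattern may differ:

import re

# assumed stand-in for SCIENTIFIC
SCIENTIFIC = re.compile(r'^([-+]?\d*\.?\d+)(?:[eE]([-+])?(\d+))?$')

def tofloat(text, default=0.):
    '''Mirror ScientificSpinBox.tofloat/frommatch for plain strings'''
    match = SCIENTIFIC.match(text)
    if match is None:
        return default
    sig, sign, exp = match.groups()
    return float(sig) * 10 ** int((sign or '+') + (exp or '0'))

print(tofloat('1.5e-3'))   # 0.0015
print(tofloat('2.5'))      # 2.5
print(tofloat('junk'))     # 0.0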
Example #28
    'ID_REGEX',
    'MNEMONIC_REGEX'
]

# CONSTANTS
# ---------

ID = (r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]'
      r'([A-Z][A-Z0-9]{2}[0-9]){1,2}')
MNEMONIC = r'[a-zA-Z0-9]{1,5}_[a-zA-Z0-9]{1,5}'


# REGEXP
# ------

ID_REGEX = re.compile(ID, re.IGNORECASE)
MNEMONIC_REGEX = re.compile(MNEMONIC, re.IGNORECASE)


GIT_SERVER = {
    'domain': {
        'repos': 'repos',
        'tags': ['tags'],
        'releases': ['releases'],
        'assets': ['releases', 'assets'],
        'owner': 'Alexhuszagh',
        'repo': 'xlDiscoverer'
    },
    'protocol': 'https',
    'scheme': '://',
    'host': 'api.github.com',
Example #29
from xldlib.resources.parameters import defaults
from xldlib.utils import decorators, logger
from xldlib.xlpy.tools import peak_picking

# load objects/functions
from collections import namedtuple


# OBJECTS
# -------
BinaryData = namedtuple("BinaryData", "data precision compression")


# REGEXP
# ------
MZML_SCAN = re.compile(r'scan=([0-9]+)')


# HELPERS
# -------


@logger.init('scans', level='DEBUG')
class Start(base.BaseObject):
    '''Utilities for processing data from XML start elements'''

    def __init__(self, group):
        super(Start, self).__init__()

        self.group = group
        self.source = self.app.discovererthread
Example #30
class ProteomeDiscovererMods(mapping.MethodlessCopyDict, SQLiteUtils):
    '''
    Processes the SQLITE3 mod objects to produce a mod dictionary
    :
        3 SQLITE3 tables -> {modname: [formula, res_csv]}
    '''

    _attrs = {
        'identifiers': ("AminoAcidModificationID, AminoAcidID",
                        "AminoAcidModificationsAminoAcids"),
        'mods':
        ("AminoAcidModificationID, ModificationName, "
         "Abbreviation, Substitution, LeavingGroup", "AminoAcidModifications"),
        'aminoacids':
        ("AminoAcidID, AminoAcidName, OneLetterCode", "AminoAcids"),
    }
    formula = re.compile(r'\(|\)')

    def __init__(self, cursor, engine):
        super(ProteomeDiscovererMods, self).__init__()

        self.cursor = cursor
        self.engine = engine
        self._terms = {self.engine['nterm'], self.engine['cterm']}

        for attr, (column, table) in self._attrs.items():
            setattr(self, attr, self.fetch("fetchall", column, table))

        self.aminoacids = {i[0]: i[1:] for i in self.aminoacids}
        self.mods = {i[0]: i[1:] for i in self.mods}

        self.ids = {}

        self.add_standard()
        self.add_terminal()
        self.process_residues()

    # ------------------
    #       MAIN
    # ------------------

    def add_standard(self):
        '''Adds the standard mods which do not have a "terminal" ID'''

        for mod_id, aminoacid_id in self.identifiers:
            modname, abbrev, string_formula, leaving_group = self.mods[mod_id]
            if 'Mascot' in modname:
                continue

            self.ids[mod_id] = modname
            formula = self._get_formula(string_formula, modname, leaving_group)
            if formula is None:
                continue
            holder = [formula, set()]
            self.setdefault(modname, holder)
            self.setdefault(abbrev, holder)

            residue = self._get_residue(aminoacid_id)
            self[modname][1].add(residue)

    def add_terminal(self):
        '''Adds the mods which are terminal only'''

        standard_ids = set(i[0] for i in self.identifiers)
        terminal_ids = set(self.mods).difference(standard_ids)
        for mod_id in terminal_ids:
            modname, abbrev = self.mods[mod_id][0:2]
            self.ids[mod_id] = modname

            self.setdefault(modname, TERMINAL_MODS[modname])
            self.setdefault(abbrev, TERMINAL_MODS[modname])

    def process_residues(self):
        '''Processes the residues from a set to the CSV format'''

        # memoize to avoid double processing the same holder from the
        # modname and the abbrev
        memo = set()
        for values in self.values():
            id_ = id(values)

            if id_ not in memo:
                residues = ','.join(sorted(values[1]))
                values[1] = residues
                memo.add(id_)

    # ------------------
    #      UTILS
    # ------------------

    def _get_formula(self, string_formula, modname, leaving_group):
        '''
        Produces the net mod formula, with the addition (or overall) formula
        defined by string_formula and the loss by the leaving_group.
        If no formula is defined, or it is a monomer formula (not chemical),
        an AssertionError or AttributeError is raised and the formula
        is resolved via .mods.MONOMERS instead.
        '''

        try:
            assert string_formula
            string_formula = self.formula.sub('', string_formula)
            formula = chemical.Molecule(string_formula)
            formula.update_formula(leaving_group, count=-1)
            formula = formula.tostr()

        except (AssertionError, AttributeError):
            formula = MONOMERS.get(modname)
            if formula is None:
                print(exception.CODES['024'].format(modname), file=sys.stderr)
        return formula

    def _get_residue(self, aminoacid_id):
        '''Returns the residue name based on whether N-/C-term or internal'''

        aminoacid_name, one_letter = self.aminoacids[aminoacid_id]

        if aminoacid_name in self._terms:
            residue = aminoacid_name
        else:
            residue = one_letter

        return residue
Example #31
    'HierarchicalDataframe',
    'QuantitativeDataframe'
]


# CONSTANTS
# ---------

LOWER_SIGMA = u'\u03C3'
UPPER_SIGMA = u'\u03A3'


# REGEXES
# -------

NONQUANTIFIED = re.compile(u'<|>|-|{}'.format(xictools.INFINITY), re.UNICODE)


# DATA
# ----

CONCATENATED = {
    'report',
    'best_peptide',
    'best_peptide_file',
}

POLYPEPTIDE = {
    reports.LINKTYPES['interlink'],
    reports.LINKTYPES['multilink'],
}
Example #32
    def __init__(self, enzyme=None):
        super(ProteolyticEnzyme, self).__init__()

        self.enzyme = self._enzymechecker(enzyme)
        self.cut_regex = re.compile(self.enzyme.cut_regex)
            Solution: Same as above, only with 'D,E' rather than 'K'
'''

# load modules
from xldlib import exception
from xldlib.definitions import re, ZIP
from xldlib.objects import matched
from xldlib.qt.objects import base
from xldlib.utils import logger

from . import hierarchical

# REGEXES
# -------

NUMBER = re.compile(r'-?[0-9]*\.?[0-9]+')

# HELPERS
# -------


@logger.init('matched', 'DEBUG')
class CheckTermini(base.BaseObject):
    '''Helper class which identifies false modification termini'''
    def __init__(self, engine):
        super(CheckTermini, self).__init__()

        self.nterm = engine.defaults.nterm
        self.engine_modifications = engine.defaults.modifications

        source = self.app.discovererthread
Example #34
    'bzip2',
    'gz',
    'hdf5',
    'mime',
    'pkzip',
    'raw',
    'seek_start',
    'sqlite',
    'tar',
    'xml',
]

# REGEX
# -----

XML_DECLARATION = re.compile(r'<\?xml version="\d\.\d" encoding=".+"\?>\r?\n')
XML_FORMAT = re.compile(r'^\s*<\w+ xmlns=')
MIME_DELARATION = re.compile('MIME-Version: .+')

# HELPERS
# -------


@contextlib.contextmanager
def seek_start(fileobj):
    '''
    Context manager that seeks to the start of the fileobj,
    yields the fileobj, and then seeks back to the start,
    so that sequential reads made during file-format
    determination leave the fileobj position unchanged.
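The body of seek_start is cut off above. Based only on its docstring, a minimal sketch might look like the following; the real implementation may differ:

import contextlib
import io

@contextlib.contextmanager
def seek_start(fileobj):
    '''Yield fileobj positioned at its start, then rewind it again'''
    fileobj.seek(0)        # rewind before handing the file out
    try:
        yield fileobj
    finally:
        fileobj.seek(0)    # rewind so later readers see an unchanged position

with seek_start(io.BytesIO(b'<?xml version="1.0" encoding="UTF-8"?>\n')) as f:
    magic = f.read(5)      # peek at the leading bytes for format detection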
Example #35
from xldlib.definitions import re, ZIP
from xldlib.qt.objects import base
from xldlib.resources.parameters import defaults
from xldlib.utils import decorators, logger
from xldlib.xlpy.tools import peak_picking


# OBJECTS
# -------
BinaryData = namedtuple("BinaryData", "data precision compression byteorder")


# REGEXP
# ------
MZXML_RT = re.compile(r'^PT((\d*\.?\d*)M)?((\d*\.?\d*)S)?$')


# HELPERS
# -------


@logger.init('scans', level='DEBUG')
class Start(base.BaseObject):
    '''Utilities for processing data from XML start elements'''

    def __init__(self, group):
        super(Start, self).__init__()

        self.source = self.app.discovererthread
Example #36
# load modules/submodules
from xldlib.definitions import re

__all__ = ['GIT_SERVER', 'ID_REGEX', 'MNEMONIC_REGEX']

# CONSTANTS
# ---------

ID = (r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]'
      r'([A-Z][A-Z0-9]{2}[0-9]){1,2}')
MNEMONIC = r'[a-zA-Z0-9]{1,5}_[a-zA-Z0-9]{1,5}'

# REGEXP
# ------

ID_REGEX = re.compile(ID, re.IGNORECASE)
MNEMONIC_REGEX = re.compile(MNEMONIC, re.IGNORECASE)

GIT_SERVER = {
    'domain': {
        'repos': 'repos',
        'tags': ['tags'],
        'releases': ['releases'],
        'assets': ['releases', 'assets'],
        'owner': 'Alexhuszagh',
        'repo': 'xlDiscoverer'
    },
    'protocol': 'https',
    'scheme': '://',
    'host': 'api.github.com',
    'path': '/',
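As a quick, self-contained sanity check of the two UniProt patterns above (the P62894/CYC_BOVIN pair is taken from the FASTA description shown in the Proteome Discoverer example):

import re

ID = (r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]'
      r'([A-Z][A-Z0-9]{2}[0-9]){1,2}')
MNEMONIC = r'[a-zA-Z0-9]{1,5}_[a-zA-Z0-9]{1,5}'

ID_REGEX = re.compile(ID, re.IGNORECASE)
MNEMONIC_REGEX = re.compile(MNEMONIC, re.IGNORECASE)

assert ID_REGEX.match('P62894')              # accession
assert MNEMONIC_REGEX.match('CYC_BOVIN')     # mnemonic entry name
assert ID_REGEX.match('not an accession') is None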