Exemple #1
0
    def _parseAC(self, line: str):
        """
        Handle an ``AC`` line by collecting every accession found on it.

        On the first AC line of an entry (i.e. while no record exists yet)
        a new `ProteinRecord` and its UniProt `DBRef` key are created from
        the first accession on the line, the entry ID (if any) is added as
        a symbol, and the key is attached to the record.  On every AC line,
        each accession is stored as an "accession" string on the record.

        :param line: the raw ``AC`` line
        :return: always 0
        :raise IndexError: if this is the first AC line of an entry and no
                           accession matches `Parser.AC_RE`
        """
        found = Parser.AC_RE.findall(line)
        is_first_ac_line = self.record is None

        if is_first_ac_line:
            # -1 is a placeholder; per the original note it ensures that a
            # species ID has to be set later on
            #noinspection PyTypeChecker
            self.record = ProteinRecord(-1, length=self._length)
            self.db_key = DBRef(Namespace.uniprot, found[0])

            # an empty entry ID is not recorded as a symbol
            if self._id:
                self.record.addSymbol(self._id)

            self.record.addDBRef(self.db_key)

        record = self.record

        for accession in found:
            record.addString("accession", accession)

        return 0
Exemple #2
0
class Parser(AbstractLoader):
    """
    A parser for UniProtKB text files.

    Overrides the `_setup`, `_cleanup` and `_parse` hooks inherited from
    `AbstractLoader`.  Every line is dispatched on its two-letter line code
    to a ``_parseXX`` handler; the handlers accumulate one `ProteinRecord`
    per entry, which is handed to `_loadRecord` when the terminating ``//``
    line is reached.

    All handlers return 0, except `_parseEND`, which returns 1 after handing
    over a record (presumably a count of completed records -- confirm
    against `AbstractLoader`).
    """

    # Prefixes of "Full" names that make `_parseDE` drop the name entirely.
    # NOTE(review): the original condition also mentioned
    # "uncharacterized protein", but guarded it with `subcat == "Short"`
    # inside the `subcat == "Full"` branch, so that check could never fire.
    # The effective behaviour is preserved here; confirm whether such names
    # were meant to be dropped, too.
    _SKIPPED_NAME_PREFIXES = ("putative ", "probable ", "similar to ")

    def _resetRecordState(self):
        """Forget everything collected for the current entry."""
        self.db_key = None
        self.record = None
        self._id = ''
        self._length = None
        self._name_cat = None
        self._skip_sequence = False

    def _setup(self, stream: io.TextIOWrapper) -> int:
        """
        Prepare the parser state, the line dispatcher and the species IDs.

        :param stream: the input stream, passed on to the parent `_setup`
        :return: whatever the parent `_setup` returned
        """
        lines = super()._setup(stream)
        self._resetRecordState()
        self._name_state = None

        # Set up a dispatcher pattern for parsing lines given the line
        # type, which is defined by the first two letters on the line:
        self._dispatcher = {
            "ID": self._parseID, "AC": self._parseAC, "DE": self._parseDE,
            "GN": self._parseGN, "OX": self._parseOX, "RX": self._parseRX,
            "DR": self._parseDR, "KW": self._parseKW, "SQ": self._parseSQ,
            "//": self._parseEND,  # "DT": self._parseDT,
            "OS": skip, "OG": skip, "OC": skip, "OH": skip,
            "RN": skip, "RP": skip, "RC": skip, "RG": skip,
            "RA": skip, "RT": skip, "RL": skip, "CC": skip,
            "PE": skip, "FT": skip, "DT": skip,
        }

        # UniProt sometimes has species not (yet) in the NCBI Taxonomy;
        # To avoid issues, map these IDs to the "unknown" species ID;
        # However, to do this, all valid species IDs need to be known:
        self._species_ids = frozenset(
            i[0] for i in self.session.query(Species.id)
        )
        return lines

    def _cleanup(self, stream: io.TextIOWrapper) -> int:
        """Delegate to the parent `_cleanup` and return its result."""
        return super()._cleanup(stream)

    def _parse(self, line: str) -> int:
        """
        Dispatch a single line to the handler for its two-letter line code.

        While the sequence section of an entry is being skipped (after an
        ``SQ`` line), only a line starting with ``//`` is acted upon.  Empty
        lines are ignored.

        :param line: one line of the UniProtKB text file
        :return: the handler's return value, or 0 for ignored lines
        :raise KeyError: if the line code is not in the dispatcher
        """
        if line and not self._skip_sequence:
            return self._dispatcher[line[0:2]](line)
        elif self._skip_sequence and line.startswith('//'):
            return self._parseEND(line)
        else:
            return 0

    ID_RE = re.compile(
        r'ID\s+'
        r'(?P<id>\w+)\s+'
        r'(?P<status>Reviewed|Unreviewed);\s+'
        r'(?P<length>\d+)\s+'
        r'AA\.'
    )

    def _parseID(self, line: str):
        """
        Remember the entry ID and the sequence length of an ``ID`` line.

        :param line: the raw ``ID`` line
        :return: always 0
        :raise AttributeError: if the line does not match `ID_RE`
        """
        mo = Parser.ID_RE.match(line)
        self._id = mo.group('id')
        self._length = int(mo.group('length'))
        return 0

    # Accessions are either 6 or 10 characters long; the optional trailing
    # group accepts the 10-character form in addition to the 6-character
    # form that was matched before.
    AC_RE = re.compile(
        r'\s+(?P<accession>'
        r'[A-Z][0-9][A-Z0-9]{3}[0-9](?:[A-Z][A-Z0-9]{2}[0-9])?'
        r');'
    )

    def _parseAC(self, line: str):
        """
        Handle an ``AC`` line by collecting every accession found on it.

        The first AC line of an entry creates the record and its UniProt
        `DBRef` key from the first accession on the line; every accession
        of every AC line is stored as an "accession" string on the record.

        :param line: the raw ``AC`` line
        :return: always 0
        :raise IndexError: if this is the first AC line of an entry and no
                           accession matches `AC_RE`
        """
        accessions = Parser.AC_RE.findall(line)

        # only once, even if there are multiple AC lines:
        if self.record is None:
            # ensure a species ID has to be set later:
            #noinspection PyTypeChecker
            self.record = ProteinRecord(-1, length=self._length)
            self.db_key = DBRef(Namespace.uniprot, accessions[0])

            if self._id:
                self.record.addSymbol(self._id)

            self.record.addDBRef(self.db_key)

        for acc in accessions:
            self.record.addString("accession", acc)

        return 0

    # No place to store record versions; Would this be useful?

    #    DT_RE = re.compile(
    #        r'DT\s+\d{2}\-[A-Z]{3}\-\d{4}, entry version (?P<version>\d+)\s*\.'
    #    )
    #
    #    def _parseDT(self, line: str):
    #        mo = Parser.DT_RE.match(line)
    #
    #        if mo:
    #            self.record.version = mo.group('version')
    #
    #        return 0

    DE_RE = re.compile(
        r'DE\s+'
        r'(?:(?P<category>(?:Rec|Alt|Sub)Name|Flags|Contains|Includes):)?'
        r'(?:\s*(?P<subcategory>[^=]+)(?:=(?P<name>.+))?)?'
    )

    @staticmethod
    def _normalizeFullName(name: str):
        """
        Clean up a "Full" name; return None if the name is to be dropped.

        The initial of a capitalized first word is lower-cased, inverted
        "tail, head" names are re-ordered to "head tail", and names starting
        with one of `_SKIPPED_NAME_PREFIXES` (before or after re-ordering)
        are rejected.
        """
        end = name.find(' ')

        if end == -1:
            end = len(name)

        # only touch "Capitalized" first words, not acronyms or mixed case
        if name[0].isupper() and name[1:end].islower():
            name = "{}{}".format(name[0].lower(), name[1:])

        if name.startswith(Parser._SKIPPED_NAME_PREFIXES):
            return None

        # move whatever follows the last ", " to the front, repeatedly
        comma = name.rfind(', ')

        while comma != -1:
            name = "{} {}".format(name[comma + 2:], name[:comma])
            comma = name.rfind(', ')

        # the re-ordering may have moved a skipped prefix to the front
        if name.startswith(Parser._SKIPPED_NAME_PREFIXES):
            return None

        return name

    def _parseDE(self, line: str):
        """
        Handle a ``DE`` (description) line: record names, symbols, EC codes.

        The category (RecName, AltName, SubName) is remembered across lines,
        because continuation lines do not repeat it.  Lines of the Flags,
        Contains and Includes categories are ignored.

        :param line: the raw ``DE`` line
        :return: always 0
        :raise AssertionError: if subcategory or name are missing or the
                               name does not end with a semicolon
        :raise RuntimeError: for an unknown subcategory
        """
        mo = Parser.DE_RE.match(line)
        cat = mo.group('category')
        subcat = mo.group('subcategory')
        name = mo.group('name')

        if cat in ('Flags', 'Contains', 'Includes'):
            return 0
        elif cat:
            self._name_cat = cat

        assert subcat is not None and name is not None, line
        assert name[-1] == ';', name
        name = name[:-1]

        # remove backslash on names ending with a backslash in TrEMBL
        while name.endswith('\\'):
            name = name[:-1]

        # swap rather peculiar short and full name assignments
        # treat the former as symbol and the latter as name
        if subcat == "Short" and len(name) > 16 and ' ' in name:
            subcat = "Full"

        if subcat == "Full" and len(name) < 6 and name.find(' ') == -1:
            subcat = "Short"

        if subcat == "Full":
            name = Parser._normalizeFullName(name)

            if name is None:
                return 0

        # the first RecName values become the record's main name and symbol
        if self._name_cat == 'RecName':
            if subcat == 'Full' and not self.record.name:
                self.record.name = name
            elif subcat == 'Short' and not self.record.symbol:
                self.record.symbol = name
            elif subcat == 'EC' and not self.record.symbol:
                self.record.symbol = name

        if subcat == 'Full':
            self.record.addName(name)
        elif subcat == 'Short':
            self.record.addSymbol(name)
        elif subcat == 'EC':
            self.record.addKeyword("EC{}".format(name))
        elif subcat in ('Allergen', 'Biotech', 'CD_antigen', 'INN'):
            pass
        else:
            raise RuntimeError(
                'unknown DE subcategory field "{}"'.format(subcat)
            )

        return 0

    GN_RE = re.compile(r'\s+(?P<key>\w+)\s*=\s*(?P<value>[^;]+);')

    def _parseGN(self, line: str):
        """
        Handle a ``GN`` (gene name) line: record symbols, names, keywords.

        Name and Synonyms values shorter than 16 characters or without a
        space are added as symbols, all others as names; ordered locus and
        ORF names are added as keywords.

        :param line: the raw ``GN`` line
        :return: always 0
        :raise RuntimeError: for an unknown key
        """
        # an "and" line carries no key=value pairs; check both the bare
        # form and the form that still has its line code in front
        if line == 'and' or line[2:].strip() == 'and':
            return 0

        for key, value in Parser.GN_RE.findall(line):
            if key == 'Name':
                if len(value) < 16 or ' ' not in value:
                    self.record.addSymbol(value)
                else:
                    self.record.addName(value)
            elif key == 'Synonyms':
                for s in value.split(','):
                    s = s.strip()

                    if len(s) < 16 or ' ' not in s:
                        self.record.addSymbol(s)
                    else:
                        self.record.addName(s)
            elif key in ('OrderedLocusNames', 'ORFNames'):
                for s in value.split(','):
                    self.record.addKeyword(s.strip())
            else:
                raise RuntimeError(
                    'unknown GN category field "{}"'.format(key)
                )

        return 0

    OX_RE = re.compile(r'OX\s+NCBI_TaxID\s*=\s*(?P<species>\d+);')

    def _parseOX(self, line: str):
        """
        Handle an ``OX`` line: set the species ID of the record.

        Taxonomy IDs not found among the species IDs loaded in `_setup` are
        replaced by `SpeciesIds.unidentified`.  Lines not matching `OX_RE`
        leave the record untouched.

        :param line: the raw ``OX`` line
        :return: always 0
        """
        matched = Parser.OX_RE.match(line)

        if matched:
            species = int(matched.group('species'))

            # UniProt declares TaxIDs that sometimes don't (yet) exist...
            if species not in self._species_ids:
                logging.debug('unknown species ID=%d for %s (%s)',
                              species, self.db_key.accession, self._id)
                species = SpeciesIds.unidentified
            else:
                logging.debug('known species ID=%d for %s (%s)',
                              species, self.db_key.accession, self._id)

            self.record.species_id = species

        return 0

    RX_RE = re.compile(r'RX\s+.*?PubMed\s*=\s*(?P<pmid>\d+);?')

    def _parseRX(self, line: str):
        """
        Handle an ``RX`` line: add its PubMed ID, if any, to the record.

        :param line: the raw ``RX`` line
        :return: always 0
        """
        matched = Parser.RX_RE.match(line)

        if matched:
            self.record.pmids.add(int(matched.group('pmid')))

        return 0

    DR_RE = re.compile(
        r'DR\s+(?P<namespace>[\w/\-]+)\s*;\s+(?P<accessions>.*)'
    )

    def _parseDR(self, line: str):
        """
        Handle a ``DR`` (cross-reference) line: add DB references.

        The namespace is looked up in `TRANSLATE`; a truthy entry is called
        with the list of stripped, semicolon-separated fields and every
        reference it yields is added to the record.  Unknown namespaces are
        logged at info level; lines not matching `DR_RE` are skipped.

        :param line: the raw ``DR`` line
        :return: always 0
        :raise AssertionError: if the fields do not end with a period
        """
        mo = Parser.DR_RE.match(line)
        namespace = None

        # NOTE(review): the handlers below also swallow KeyError and
        # AttributeError raised inside the translator or `addDBRef`; kept
        # as-is because that may be deliberate best-effort behaviour.
        try:
            namespace = mo.group('namespace')
            # raise KeyError if unknown NSs are added:
            if TRANSLATE[namespace]:
                assert mo.group('accessions')[-1] == '.', mo.group(
                    'accessions')

                for db_ref in TRANSLATE[namespace]([
                    i.strip() for i in mo.group('accessions')[:-1].split(';')
                ]):
                    self.record.addDBRef(db_ref)
        except KeyError:
            logging.info("unknown Namespace '%s'", namespace)
        except AttributeError:
            # raised when the line did not match (mo is None)
            pass

        return 0

    KW_RE = re.compile(r'\s+(?P<keyword>[^;]+)(?:;|\.$)')

    def _parseKW(self, line: str):
        """
        Handle a ``KW`` line: add every keyword except 'Complete proteome'.

        :param line: the raw ``KW`` line
        :return: always 0
        """
        for kwd in Parser.KW_RE.findall(line):
            if kwd != 'Complete proteome':
                self.record.addKeyword(kwd)

        return 0

    SQ_RE = re.compile(
        r'SQ\s+SEQUENCE\s+'
        r'(?P<length>\d+)\s+AA;\s+'
        r'(?P<mass>\d+)\s+MW;\s+(?P<crc64>\w+)\s+CRC64;'
    )

    def _parseSQ(self, line: str):
        """
        Handle an ``SQ`` line: store the mass and start skipping lines.

        After this line, `_parse` ignores everything up to the next line
        starting with ``//``.

        :param line: the raw ``SQ`` line
        :return: always 0
        :raise AttributeError: if the line does not match `SQ_RE`
        """
        self.record.mass = int(Parser.SQ_RE.match(line).group('mass'))
        self._skip_sequence = True
        return 0

    #noinspection PyUnusedLocal
    def _parseEND(self, line: str):
        """
        Handle the ``//`` line: hand over the record and reset the state.

        :param line: the raw ``//`` line (unused)
        :return: always 1
        """
        #noinspection PyTypeChecker
        self._loadRecord(self.db_key, self.record)
        self._resetRecordState()
        return 1