Example #1
0
File: mgi.py Project: fnl/gnamed
    def _getRecord(self, db_key: DBRef):
        if db_key in self._records:
            record = self._records[db_key]
        else:
            logging.debug('creating a new record for %s:%s', *db_key)
            record = GeneRecord(Species.mouse)
            record.addDBRef(db_key)
            self._records[db_key] = record

        return record
Example #2
0
File: sgd.py Project: fnl/gnamed
    def _parse(self, line: str):
        """
        Parse one tab-separated SGD line, stacking alias lines per gene.

        Lines are grouped by their ID: while the ID equals the accession of
        ``self._db_key``, only the alias of the line is merged into the
        record held in ``self._record``. A line with a different ID first
        loads the stacked record (if any) and then starts a new one.

        Returns 1 if a stacked record was loaded by this call, 0 otherwise.
        """
        fields = [field.strip() for field in line.split('\t')]
        assert len(fields) == len(CONTENT), '{} items'.format(len(fields))
        # a field consisting of just two double quotes stands for "no value"
        row = Line._make([None if field == '""' else field
                          for field in fields])
        loaded = 0
        starts_new_gene = (self._db_key is None or
                           row.id != self._db_key.accession)

        if starts_new_gene:
            # hand over the gene collected so far before starting the next
            if self._record is not None:
                self._loadRecord(self._db_key, self._record)
                loaded = 1

            systematic_name = row.location
            # the systematic name doubles as symbol when no symbol is given;
            # its second character is used as the chromosome
            #noinspection PyTypeChecker
            gene = GeneRecord(Species.budding_yeast,
                              symbol=row.symbol or systematic_name,
                              name=row.name,
                              chromosome=systematic_name[1],
                              location=systematic_name)

            # register the SGD reference of the new gene
            self._db_key = DBRef(Namespace.sgd, row.id)
            gene.addDBRef(self._db_key)

            # with an official symbol present, the systematic name is
            # kept as an additional symbol
            if row.symbol:
                gene.addSymbol(systematic_name)

            # the gene length is stored as a keyword
            if row.length:
                gene.addKeyword(row.length)

            # protein names serve as alternative symbols
            if row.protein_name:
                gene.addSymbol(row.protein_name)

            # keep the record around: further alias lines may follow
            self._record = gene

        alias = row.alias

        if alias and alias not in (row.symbol, row.name, row.location):
            # longer aliases containing a space are treated as names,
            # everything else as a symbol
            reads_like_name = " " in alias and len(alias) > 8
            register = (self._record.addName if reads_like_name
                        else self._record.addSymbol)
            register(alias)

        return loaded
Example #3
0
File: tair.py Project: fnl/gnamed
 def _parseName(self, line: str):
     """
     Parse a tab-separated TAIR name line and cache the resulting record.

     The line holds an accession, a symbol and an optional name; a name
     wrapped in double quotes is unquoted, a missing name becomes None.
     The new cress ``GeneRecord`` is stored in ``self._records`` under its
     TAIR ``DBRef``.

     Returns 1.
     """
     fields = [part.strip() for part in line.split('\t')]

     if len(fields) == 2:
         # no name column on this line
         # noinspection PyTypeChecker,PyTypeChecker
         fields.append(None)
     else:
         third = fields[2]

         if third and third.startswith('"') and third.endswith('"'):
             fields[2] = third[1:-1]

     assert len(fields) == 3, '{} items'.format(len(fields))
     accession, symbol, name = fields
     db_key = DBRef(Namespace.tair, accession)
     #noinspection PyTypeChecker
     gene = GeneRecord(Species.cress, symbol=symbol, name=name)
     gene.addDBRef(db_key)
     gene.addSymbol(symbol)

     if name is not None:
         gene.addName(name)

     self._records[db_key] = gene
     logging.debug('parsed the name for %s:%s', *db_key)
     return 1
Example #4
0
File: rgd.py Project: fnl/gnamed
    def _parse(self, line: str):
        """
        Parse one tab-separated RGD line into a rat GeneRecord and load it.

        The columns listed in ``COLUMNS`` are picked from the line, turned
        into a ``Line`` row and used to populate DB references, symbols,
        names and keywords of the record, which is then handed to
        ``self._loadRecord``.

        If loading raises ``DuplicateEntityError`` and the row lists Entrez
        accessions, the references of the conflicting genes are re-pointed
        to the gene the RGD reference belongs to, the orphaned genes are
        deleted, and the load is retried once. Without Entrez accessions
        the error is re-raised.

        Returns 1.
        """
        items = [i.strip() for i in line.split('\t')]
        assert len(items) > 32, '{} items'.format(len(items))

        # empty fields and '-' placeholders in the used columns become None
        for idx in COLUMNS:
            if not items[idx] or items[idx] == '-':
                items[idx] = None

        row = Line._make(items[col] for col in COLUMNS)
        # the location is the chromosome directly followed by the row's
        # location value, and only set if both are present
        #noinspection PyTypeChecker
        record = GeneRecord(Species.rat, symbol=row.symbol, name=row.name,
                            chromosome=row.chromosome,
                            location=(
                                '{}{}'.format(row.chromosome, row.location)
                                if row.chromosome and row.location else None
                            ))
        db_key = DBRef(Namespace.rgd, row.id)

        # add DB references
        record.addDBRef(db_key)

        # NOTE(review): getattr(row, ns) presumes the Namespace constants
        # are strings equal to the Line field names -- confirm
        for ns in (Namespace.entrez, Namespace.uniprot):
            accs = getattr(row, ns)

            if accs:
                if ns == Namespace.uniprot:
                    # every listed UniProt accession is linked
                    # noinspection PyUnresolvedReferences
                    for acc in accs.split(';'):
                        record.addDBRef(DBRef(ns, acc))
                else:
                    # only the first listed Entrez accession is linked
                    # noinspection PyUnresolvedReferences
                    accs = accs.split(';')
                    record.addDBRef(DBRef(ns, accs[0]))

        # parse symbol strings
        if row.symbol:
            record.addSymbol(row.symbol)

        # old and QTL symbols are ';'-separated lists (added unstripped)
        for field in (row.old_symbols, row.qtl_symbols):
            if field:
                for symbol in field.split(';'):
                    record.addSymbol(symbol)

        # parse name strings
        if row.name:
            record.addName(row.name)

        if row.old_names:
            for name in row.old_names.split(';'):
                record.addName(name.strip())

        # parse keywords strings
        # (descriptions are split on '; ', i.e., semicolon plus space)
        if row.descriptions:
            for desc in row.descriptions.split('; '):
                record.addKeyword(desc.strip())

        try:
            self._loadRecord(db_key, record)
        except DuplicateEntityError:
            # all Entrez accessions of the row, still as one ';'-joined string
            accs = getattr(row, Namespace.entrez)

            if accs:
                # Entrez Gene is not unique, having created multiple GIs for
                # the same gene. Sometimes, single Entrez Genes are badly
                # linked by RGD, as in the case of RGD:69363 linking to
                # GI:113900, that should be linked to GI:10092108. This code
                # can update such artifacts in RGD, too, and eliminates the
                # duplicate Genes.
                logging.warning('removing duplicate rat genes for '
                                'rgd:%s with Entrez GIs %s',
                                row.id, accs)
                # the stored RGD reference decides which gene is kept
                # NOTE(review): .one() presumably raises if the reference is
                # missing or ambiguous (SQLAlchemy Query.one) -- confirm
                rgd_ref = self.session.query(GeneRef).filter(
                    GeneRef.accession == row.id
                ).filter(GeneRef.namespace == Namespace.rgd).one()
                logging.debug('correct %s links to gene:%s',
                              repr(rgd_ref), rgd_ref.id)
                # genes that lose their references, keyed by gene ID
                orphan_genes = {}

                # Update retired RGD and Entrez entries by pointing the
                # outdated Refs to the right Gene (rgd_ref.id), while deleting
                # the "duplicate" Genes.
                # noinspection PyUnresolvedReferences
                for gi in accs.split(';'):
                    entrez_ref = self.session.query(GeneRef).filter(
                        GeneRef.accession == gi
                    ).filter(GeneRef.namespace == Namespace.entrez).one()

                    if entrez_ref.id != rgd_ref.id:
                        try:
                            # an RGD reference on the other gene, if there is
                            # one, is moved to the kept gene
                            retired_ref = self.session.query(GeneRef).filter(
                                GeneRef.id == entrez_ref.id
                            ).filter(GeneRef.namespace == Namespace.rgd).one()
                            logging.debug('updating %s and retired %s '
                                          'reference to orphan gene:%s',
                                          repr(entrez_ref), repr(retired_ref),
                                          entrez_ref.id)
                            retired_ref.id = rgd_ref.id
                        except NoResultFound:
                            logging.debug('updating %s reference '
                                          'to orphan gene:%s',
                                          repr(entrez_ref), entrez_ref.id)

                        # fetch the gene to delete while entrez_ref.id still
                        # holds the old gene ID
                        if entrez_ref.id not in orphan_genes:
                            orphan_genes[entrez_ref.id] = self.session.query(
                                Gene
                            ).filter(
                                Gene.id == entrez_ref.id
                            ).one()

                        # finally re-point the Entrez reference itself
                        entrez_ref.id = rgd_ref.id

                for gene in orphan_genes.values():
                    self.session.delete(gene)

                # flush the corrections, then retry loading the record
                self._flush()
                self._loadRecord(db_key, record)
            else:
                raise

        return 1
Example #5
0
File: hgnc.py Project: fnl/gnamed
    def _parse(self, line: str):
        """
        Parse one tab-separated HGNC line into a human GeneRecord and load it.

        '-' placeholders are blanked and short lines are padded to 16
        columns before the row is built. DB references (corrected via
        ``FIX_ACCESSION`` and ``WRONG_DB_REFS`` where applicable), symbols,
        names and gene family keywords are added to the record, which is
        then handed to ``self._loadRecord``.

        If loading raises ``DuplicateEntityError`` and the record has exactly
        two references, all non-HGNC references are dropped and the load is
        retried once; otherwise the error is re-raised.

        Returns 1.
        """
        items = [i.strip() for i in line.split('\t')]
        assert len(items) > 1, line

        # '-' is a placeholder for "no value"
        items = ['' if item == '-' else item for item in items]
        # pad short lines to 16 columns (a no-op for longer lines)
        items.extend([''] * (16 - len(items)))

        row = Line._make(items)
        record = GeneRecord(Species.human, symbol=row.symbol, name=row.name,
                            location=row.location if row.location else None)
        db_key = DBRef(Namespace.hgnc, row.id)
        record.addDBRef(db_key)

        # link DB references
        for ns in DB_REFS:
            acc = getattr(row, ns)

            if acc:
                if ns in FIX_ACCESSION:
                    # keep only what follows the first colon
                    # noinspection PyUnresolvedReferences
                    acc = acc[acc.find(":") + 1:]

                ref = DBRef(ns, acc)

                if ref in WRONG_DB_REFS:
                    new_ref = WRONG_DB_REFS[ref]
                    logging.info('correcting wrong ref %s->%s',
                                 '{}:{}'.format(*ref),
                                 '{}:{}'.format(*new_ref))
                    ref = new_ref

                record.addDBRef(ref)

        # parse symbol strings
        for field in (row.previous_symbols, row.synonyms):
            if field:
                for symbol in Parser._parseCD(field):
                    record.addSymbol(symbol)

        # parse name strings
        for field in (row.previous_names, row.name_synonyms):
            if field:
                for name in Parser._parseQCD(field):
                    record.addName(name)

        # parse keywords strings
        if row.gene_family_symbols:
            for kwd in Parser._parseCD(row.gene_family_symbols):
                record.addKeyword(kwd)

        # location with ALT_REF_LOCI values - eliminate them
        if row.location and " ALT_REF_LOCI" in row.location:
            record.location = row.location[:row.location.find(" ALT_REF_LOCI")]

        # family names may be nested with ' / ' and ' : ' separators; each
        # part becomes a keyword unless it is just "other"
        for name in Parser._parseQCD(row.gene_family_names):
            for subname in name.split(' / '):
                for subsubname in subname.split(' : '):
                    subsubname = subsubname.strip()

                    if subsubname.lower() not in ('other', '"other"'):
                        record.addKeyword(subsubname)

        try:
            self._loadRecord(db_key, record)
        except DuplicateEntityError:
            if len(record.refs) == 2:
                # assume all HGNC links that do not coincide with the
                # Entrez back-link are bad, as it seems it is mostly
                # HGNC that is not up-to-date.
                # (logging.warn is a deprecated alias of logging.warning)
                logging.warning('removing likely bad Entrez ref in %s:%s',
                                *db_key)
                assert any(r.namespace == Namespace.entrez
                           for r in record.refs), record.refs
                record.refs = {r for r in record.refs if
                               r.namespace == Namespace.hgnc}
                assert len(record.refs) == 1, record.refs
                self._loadRecord(db_key, record)
            else:
                raise

        return 1
Example #6
0
File: entrez.py Project: fnl/gnamed
    def _parseMain(self, line: str):
        """
        Parse one tab-separated Entrez Gene line into a GeneRecord and load it.

        Backslashes are cleaned from the line first. Lines whose symbol
        column is 'NEWENTRY' are skipped. Otherwise '-' placeholders and
        junk names are blanked, DB cross-references not seen before are
        linked, synonyms and other designations are sorted into symbols or
        names via ``isGeneSymbol``, and PubMed IDs collected in
        ``self._pmidMapping`` are attached before the record is handed to
        ``self._loadRecord``.

        Returns 0 for skipped lines, 1 otherwise.
        """
        # remove the backslash junk in the Entrez data file: a backslash
        # followed by an alphanumeric character becomes a slash, any other
        # backslash is dropped
        idx = line.find('\\')

        while idx != -1:
            if len(line) > idx + 1 and line[idx + 1].isalnum():
                line = '{}/{}'.format(line[:idx], line[idx + 1:])
            else:
                line = '{}{}'.format(line[:idx], line[idx + 1:])

            idx = line.find('\\', idx)

        items = [i.strip() for i in line.split('\t')]

        # ignore the undocumented "NEWENTRY" junk in the file
        if items[2] == 'NEWENTRY':
            return 0

        cleanChromosome = items[6].find('|')
        # drop (too long!) chr. strings with multiple chromosomes listed
        if cleanChromosome != -1:
            items[6] = items[6][0:cleanChromosome]

        # '-' is a placeholder for "no value"
        items = ["" if item == '-' else item for item in items]

        # remove any junk names from the official names/symbols
        for col in (2, 8, 10, 11):
            if items[col] and items[col].lower() in JUNK_NAMES:
                logging.debug(
                    'removing %s "%s" from %s:%s',
                    COLNAME[col], items[col], Namespace.entrez, items[1]
                )
                items[col] = ""

        row = Line._make(items)
        # example of a bad symbol: gi:835054 (but accepted)
        assert not row.symbol or len(row.symbol) < 65, \
            '{}:{} has an illegal symbol="{}"'.format(
                Namespace.entrez, row.id, row.symbol
            )
        db_key = DBRef(Namespace.entrez, row.id)
        record = GeneRecord(row.species_id,
                            symbol=row.symbol,
                            name=row.name,
                            chromosome=row.chromosome,
                            location=row.map_location)
        record.addDBRef(db_key)

        # separate existing DB links and new DB references
        if row.dbxrefs:
            for xref in row.dbxrefs.split('|'):
                # NOTE(review): this unpacking raises ValueError for any
                # xref that does not contain exactly one colon -- confirm
                # the data file never has such entries
                db, acc = xref.split(':')

                try:
                    namespace = TRANSLATE[db]

                    # a falsy translation means the database is known, but
                    # its references are not linked
                    if namespace:
                        db_ref = DBRef(namespace, acc)

                        # each cross-reference is only linked once per run
                        if db_ref not in self._generefs:
                            record.addDBRef(db_ref)
                            self._generefs.add(db_ref)
                except KeyError:
                    # (logging.warn is a deprecated alias of logging.warning)
                    logging.warning('unknown dbXref to "%s"', db)

        # parsed symbol strings
        if row.nomenclature_symbol:
            record.addSymbol(row.nomenclature_symbol)

        if row.locus_tag:
            record.addSymbol(row.locus_tag)

        if row.synonyms:
            # clean up the synonym mess, moving names to where they
            # belong, e.g., gi:814702 cites "cleavage and polyadenylation
            # specificity factor 73 kDa subunit-II" as a gene symbol
            for sym in row.synonyms.split('|'):
                sym = sym.strip()

                if sym.lower() not in JUNK_NAMES:
                    if isGeneSymbol(sym):
                        record.addSymbol(sym)
                    else:
                        record.addName(sym)

        # parsed name strings
        if row.nomenclature_name:
            record.addName(row.nomenclature_name)

        if row.other_designations:
            # as with synonyms, at least skip the most frequent junk
            for name in row.other_designations.split('|'):
                name = name.strip()

                if name.lower() not in JUNK_NAMES:
                    if isGeneSymbol(name):
                        record.addSymbol(name)
                    else:
                        record.addName(name)

        # parsed keyword strings
        if row.type_of_gene and row.type_of_gene not in ('other', 'unknown'):
            record.addKeyword(row.type_of_gene)

        # add the PubMed links parsed earlier (if any):
        if db_key.accession in self._pmidMapping:
            record.pmids = self._pmidMapping[db_key.accession]

        self._loadRecord(db_key, record)
        return 1