Esempio n. 1
0
    def process(self):
        """Parse the Golm Metabolome Database standards library.

        Handles files such as those found at
        http://csbdb.mpimp-golm.mpg.de/csbdb/gmd/msri/gmd_msri.html

        Lines are pulled from ``parent.getLine()`` until it returns a falsy
        value.  A line starting with ``Name:`` begins a new record: the mask
        gathered so far is passed to ``parent.setMask`` when
        ``un.nid(parent.mm)`` is greater than 1, and a fresh mask is started
        with the name as its 'mpimp' identifier.  Every line is also scanned
        for 'mpimpclass', 'cas' and 'kegg' identifiers.
        """
        parent = self.parent
        parent.mm.setTableWeak('mpimpclass')
        un = mask({}, parent.mm.idpatterns)
        line = parent.getLine()
        while line:
            # a new record starts here: send off the data gathered so far
            if re.match('^Name:', line):
                if un.nid(parent.mm) > 1:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
                x = re.findall('Name: (.+)', line)
                mpimpid = x[0].strip()
                un.append('mpimp', mpimpid, parent.confid, parent.sourceid)
                # Only the synonym embedded in the title is used; the other
                # synonyms in the file are too noisy.  The synonym is the
                # sixth '_'-separated field; names with fewer fields simply
                # contribute no synonym instead of raising IndexError.
                fields = mpimpid.split('_')
                if len(fields) > 5 and fields[5].strip() != '':
                    un.append('synonym', fields[5], parent.confid,
                              parent.sourceid)
                    # also store the name without a trailing "(<n>TMS)" tag
                    tms = re.match(r"(.*)\([0-9]+[tT][mM][sS]\)$", fields[5])
                    if tms:
                        un.append('synonym', tms.group(1).strip(),
                                  parent.confid, parent.sourceid)

            # ***** gather info *****
            # mpimp class (weak confidence)
            x = re.findall(r'Synonym: Metabolite \(class\):(.+)', line)
            if x:
                un.append('mpimpclass', x[0].strip(),
                          parent.mm.confidence['weak'], parent.sourceid)
            # cas, first notation
            x = re.findall('CASNO: (.+)', line)
            if x:
                un.append('cas', x[0].strip(), parent.confid, parent.sourceid)
            # cas, second notation
            x = re.findall(r'CAS\|(\d{1,7}-\d{2}-\d{1})_\(.+\)', line)
            if x:
                un.append('cas', x[0].strip(), parent.confid, parent.sourceid)
            # kegg
            x = re.findall(r'KEGG\|([A-Z][0-9]{5})_.*\(.+\)', line)
            if x:
                un.append('kegg', x[0].strip(), parent.confid, parent.sourceid)
            line = parent.getLine()
        # send off the last mask if we have one
        if un.nid(parent.mm) > 1:
            parent.setMask(un)
Esempio n. 2
0
    def process(self):
        """Extend stored masks with the masks returned by ``self.cs``.

        Every line read from ``parent.getLine()`` is used as an ``_id`` to
        fetch a mask from ``parent.mm`` (weak lookup).  Lines without a
        matching mask are skipped.  When ``self.cs.fixInchi`` reports a
        change the mask is written back and fetched again.  The masks
        returned by ``self.cs.getCsMasks`` each receive a new association
        from ``parent.mm.addAss()`` and are merged into a single mask that
        is stored with ``setass=False``.

        A failing ``getCsMasks`` call is reported and the line is skipped;
        previously the handler dropped into ``pdb`` and then failed on the
        undefined result.
        """
        parent = self.parent

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            tmp2 = parent.mm.getMask(tmp, weak=True)
            if not tmp2:
                continue
            un = tmp2[0]
            if self.cs.fixInchi(un):
                # persist the fixed mask and re-read it from the database
                parent.setMask(un, setass=False)
                tmp2 = parent.mm.getMask(tmp, weak=True)
                if tmp2:
                    un = tmp2[0]
            try:
                csMasks = self.cs.getCsMasks(un, parent.mm)
            except Exception:
                # best effort, same reporting as the pubchem parser
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll) + " csid " + str(list(csMasks.keys())))
            cmsk = mask({})
            for csmask in csMasks.values():
                csmask.setAllAssoc(parent.mm.addAss())
                cmsk.merge(csmask)
            parent.setMask(cmsk, setass=False)
Esempio n. 3
0
    def process(self):
        """Extend stored masks with the masks returned by ``self.chebip``.

        The 'formula' table is marked weak first.  Every line read from
        ``self.parent.getLine()`` is used as an ``_id`` to fetch a mask from
        ``self.parent.mm``; lines without a matching mask are skipped.  The
        masks returned by ``self.chebip.getChebiMasks`` each receive a new
        association from ``mm.addAss()`` and are merged into one mask that
        is stored with ``setass=False``.

        A failing ``getChebiMasks`` call is reported and the line is
        skipped; previously the handler dropped into ``pdb`` and then failed
        on the undefined result.
        """
        parent = self.parent
        parent.mm.setTableWeak('formula')

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            tmp2 = parent.mm.getMask(tmp)
            if not tmp2:
                continue
            un = tmp2[0]
            try:
                chebiMasks = self.chebip.getChebiMasks(un, parent.mm)
            except Exception:
                # best effort, same reporting as the pubchem parser
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll) + " chebi " +
                      str(list(chebiMasks.keys())))
            chMask = mask({})
            for ch in list(chebiMasks.keys()):
                chebiMasks[ch].setAllAssoc(parent.mm.addAss())
                chMask.merge(chebiMasks[ch])
            parent.setMask(chMask, setass=False)
Esempio n. 4
0
    def process(self):
        """Extend stored masks with the masks returned by ``self.pc``.

        A fixed set of property tables is marked weak first.  Every line
        read from ``parent.getLine()`` is used as an ``_id`` to fetch a mask
        from ``parent.mm`` (weak lookup); lines without a matching mask are
        skipped.  The masks returned by ``self.pc.getPubchemMasks`` each
        receive a new association from ``parent.mm.addAss()`` and are merged
        into one mask that is stored with ``setass=False``.

        Errors raised by ``getPubchemMasks`` are reported and the line is
        skipped.  Only ``Exception`` subclasses are handled that way, so
        KeyboardInterrupt and SystemExit still propagate.
        """
        parent = self.parent
        weakTables = ['formula', 'weight', 'totalcharge', 'xlogp', 'hbonddonor',
                      'hbondacceptor', 'heavyatom', 'tpsa']
        for table in weakTables:
            parent.mm.setTableWeak(table)

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            tmp2 = parent.mm.getMask(tmp, weak=True)
            if not tmp2:
                continue
            un = tmp2[0]
            try:
                pcMasks = self.pc.getPubchemMasks(un)
            except Exception:
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll))
            msk = mask({})
            for pcmsk in pcMasks:
                pcmsk.setAllAssoc(parent.mm.addAss())
                msk.merge(pcmsk)
            parent.setMask(msk, setass=False)
Esempio n. 5
0
    def process(self):
        """Look up each input mask through ``self.chebip`` and store the result.

        After marking the 'formula' table weak, lines are read from
        ``self.parent.getLine()`` until a falsy value is returned.  Each
        line is the ``_id`` of a mask fetched from ``self.parent.mm``; when
        no mask is found the line is skipped.  All masks returned by
        ``self.chebip.getChebiMasks`` get their own association via
        ``mm.addAss()`` and are merged before being stored with
        ``setass=False``.

        If ``getChebiMasks`` raises, the error is reported and processing
        continues with the next line (the former handler opened ``pdb`` and
        then hit an undefined variable).
        """
        parent = self.parent
        parent.mm.setTableWeak('formula')

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            found = parent.mm.getMask(tmp)
            if not found:
                continue
            un = found[0]
            try:
                chebiMasks = self.chebip.getChebiMasks(un, parent.mm)
            except Exception:
                # best effort, same reporting as the pubchem parser
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll) + " chebi " + str(list(chebiMasks.keys())))
            chMask = mask({})
            for ch in list(chebiMasks.keys()):
                chebiMasks[ch].setAllAssoc(parent.mm.addAss())
                chMask.merge(chebiMasks[ch])
            parent.setMask(chMask, setass=False)
Esempio n. 6
0
    def process(self):
        """Look up each input mask through ``self.cs`` and store the result.

        Lines are read from ``parent.getLine()`` until a falsy value is
        returned.  Each line is the ``_id`` of a mask fetched from
        ``parent.mm`` with a weak lookup; when no mask is found the line is
        skipped.  If ``self.cs.fixInchi`` returns a true value the mask is
        stored and fetched again.  All masks returned by
        ``self.cs.getCsMasks`` get their own association via
        ``parent.mm.addAss()`` and are merged before being stored with
        ``setass=False``.

        If ``getCsMasks`` raises, the error is reported and processing
        continues with the next line (the former handler opened ``pdb`` and
        then hit an undefined variable).
        """
        parent = self.parent

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            found = parent.mm.getMask(tmp, weak=True)
            if not found:
                continue
            un = found[0]
            if self.cs.fixInchi(un):
                # write the fixed mask back and reload it from the database
                parent.setMask(un, setass=False)
                found = parent.mm.getMask(tmp, weak=True)
                if found:
                    un = found[0]
            try:
                csMasks = self.cs.getCsMasks(un, parent.mm)
            except Exception:
                # best effort, same reporting as the pubchem parser
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll) + " csid " +
                      str(list(csMasks.keys())))
            cmsk = mask({})
            for csmask in csMasks.values():
                csmask.setAllAssoc(parent.mm.addAss())
                cmsk.merge(csmask)
            parent.setMask(cmsk, setass=False)
Esempio n. 7
0
    def process(self):
        """Look up each input mask through ``self.pc`` and store the result.

        Several property tables are marked weak, then lines are read from
        ``parent.getLine()`` until a falsy value is returned.  Each line is
        the ``_id`` of a mask fetched from ``parent.mm`` with a weak lookup;
        when no mask is found the line is skipped.  All masks returned by
        ``self.pc.getPubchemMasks`` get their own association via
        ``parent.mm.addAss()`` and are merged before being stored with
        ``setass=False``.

        A failure in ``getPubchemMasks`` is reported and the line skipped.
        The handler is limited to ``Exception`` so that KeyboardInterrupt
        and SystemExit are no longer swallowed.
        """
        parent = self.parent
        weakTables = [
            'formula', 'weight', 'totalcharge', 'xlogp', 'hbonddonor',
            'hbondacceptor', 'heavyatom', 'tpsa'
        ]
        for table in weakTables:
            parent.mm.setTableWeak(table)

        while True:
            ll = parent.getLine()
            if not ll:
                break
            tmp = mask({})
            tmp.append('_id', ll)
            found = parent.mm.getMask(tmp, weak=True)
            if not found:
                continue
            un = found[0]
            try:
                pcMasks = self.pc.getPubchemMasks(un)
            except Exception:
                print("#COMMENT unknown error")
                continue
            if parent.mm.debug:
                print("#COMMENT mask " + str(ll))
            msk = mask({})
            for pcmsk in pcMasks:
                pcmsk.setAllAssoc(parent.mm.addAss())
                msk.merge(pcmsk)
            parent.setMask(msk, setass=False)
Esempio n. 8
0
    def process(self):
        """Parse the compounds.dat file from the Cyc databases.

        Inserts the cycdb frame-id (stored under ``self.uniqueid``),
        synonyms, cas, inchi, smiles and kegg identifiers.

        Lines are read with ``parent.getLine(comment='#')`` until a falsy
        value is returned.  A line consisting of ``//`` closes a record: the
        current mask is passed to ``parent.setMask`` when ``un.nid()`` is
        greater than 2 and a fresh mask (with ``MIN_OVERLAP`` set to 1) is
        started.  The mask left over at end of input is handled the same
        way.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)  # new mask
        un.MIN_OVERLAP = 1
        ll = parent.getLine(comment='#')
        while ll:
            ll = ll.strip()
            if ll == '//':
                # record separator: flush what we have and start over
                if un.nid() > 2:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
                un.MIN_OVERLAP = 1
            # frameid
            x = re.findall("UNIQUE-ID - (.*)", ll)
            if x:
                un.append(self.uniqueid, x[0], parent.confid, parent.sourceid)

            # synonym
            x = re.findall("COMMON-NAME - (.*)", ll)
            if x:
                un.append('synonym', x[0], parent.mm.confidence['weak'],
                          parent.sourceid)
            x = re.findall("SYNONYMS - (.*)", ll)
            if x:
                un.append('synonym', x[0], parent.mm.confidence['weak'],
                          parent.sourceid)

            # cas
            x = re.findall(r'DBLINKS - \(CAS "(\d{1,7}-\d{2}-\d{1})"', ll)
            if x:
                un.append('cas', x[0], parent.confid, parent.sourceid)

            # inchi
            x = re.findall("INCHI - (.*)", ll)
            if x:
                un.append('inchi', x[0], parent.confid, parent.sourceid)
            # smiles
            x = re.findall("SMILES - (.*)", ll)
            if x:
                un.append('smiles', x[0], parent.mm.confidence['weak'],
                          parent.sourceid)

            # kegg
            x = re.findall(r'DBLINKS - \(LIGAND-CPD "([A-Z][0-9]{5})"', ll)
            if x:
                un.append('kegg', x[0], parent.confid, parent.sourceid)
            ll = parent.getLine(comment='#')
        # send off the last mask if we have one
        if un.nid() > 2:
            parent.setMask(un)
Esempio n. 9
0
    def process(self):
        """Parse the Golm Metabolome Database standards library.

        Handles files such as those found at
        http://csbdb.mpimp-golm.mpg.de/csbdb/gmd/msri/gmd_msri.html

        Reads lines from ``parent.getLine()`` until a falsy value comes
        back.  Each ``Name:`` line starts a new record; before the new mask
        is created the previous one is stored through ``parent.setMask`` if
        ``un.nid(parent.mm)`` exceeds 1.  The name itself becomes the
        'mpimp' identifier and every line is additionally searched for
        'mpimpclass', 'cas' and 'kegg' identifiers.
        """
        parent = self.parent
        parent.mm.setTableWeak('mpimpclass')
        un = mask({}, parent.mm.idpatterns)
        line = parent.getLine()
        while line:
            # check and send off gathered data
            if re.match('^Name:', line):
                if un.nid(parent.mm) > 1:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
                x = re.findall('Name: (.+)', line)
                mpimpid = x[0].strip()
                un.append('mpimp', mpimpid, parent.confid, parent.sourceid)
                # Take the synonym from the title only; the other synonyms
                # are too noisy.  It is the sixth '_'-separated part of the
                # name -- a shorter name yields no synonym rather than an
                # IndexError.
                parts = mpimpid.split('_')
                if len(parts) > 5 and parts[5].strip() != '':
                    un.append('synonym', parts[5], parent.confid,
                              parent.sourceid)
                    # a trailing "(<n>TMS)" tag is stripped for a second synonym
                    tagged = re.match(r"(.*)\([0-9]+[tT][mM][sS]\)$", parts[5])
                    if tagged:
                        un.append('synonym', tagged.group(1).strip(),
                                  parent.confid, parent.sourceid)

            # ***** gather info *****
            # mpimp class
            x = re.findall(r'Synonym: Metabolite \(class\):(.+)', line)
            if x:
                un.append('mpimpclass', x[0].strip(),
                          parent.mm.confidence['weak'], parent.sourceid)
            # cas 1
            x = re.findall('CASNO: (.+)', line)
            if x:
                un.append('cas', x[0].strip(), parent.confid, parent.sourceid)
            # cas 2
            x = re.findall(r'CAS\|(\d{1,7}-\d{2}-\d{1})_\(.+\)', line)
            if x:
                un.append('cas', x[0].strip(), parent.confid, parent.sourceid)
            # kegg
            x = re.findall(r'KEGG\|([A-Z][0-9]{5})_.*\(.+\)', line)
            if x:
                un.append('kegg', x[0].strip(), parent.confid, parent.sourceid)
            line = parent.getLine()
        # send off the last mask if we have one
        if un.nid(parent.mm) > 1:
            parent.setMask(un)
Esempio n. 10
0
 def GetCompoundInfo(self, mm, csid):
     """
     take a csid, get a mask

     The csid is made url-safe, the compound-info URL is fetched through
     ``self.parent.getUrl`` and the XML response is scanned for InChI,
     SMILES and InChIKey elements, which are added to the mask next to the
     'chemspider' identifier itself.

     parameters:
     -`mm`: a metmask database
     -`csid`: a chemspider identifier

     returns the mask; it is empty when the URL fetch gave a falsy result
     """
     tmpmask = mask({}, mm.idpatterns)
     csid = self.parent.urlSafe(csid)
     url = self.GetCompoundInfoURL + "CSID=" + str(csid) + "&token=" + self.token
     qRes = self.parent.getUrl(url)
     if not qRes:
         return tmpmask
     tmpmask.append('chemspider', csid, self.parent.confid, self.parent.sourceid)
     searchResults = xml.dom.minidom.parse(qRes)
     # (mask table, XML tag) pairs, handled in the original order
     wanted = (('inchi', 'InChI'), ('smiles', 'SMILES'), ('inchikey', 'InChIKey'))
     for table, tag in wanted:
         nodes = searchResults.getElementsByTagName(tag)
         for ide in list(mmquery.nodecontents(nodes)):
             tmpmask.append(table, ide, self.parent.confid, self.parent.sourceid)
     return tmpmask
Esempio n. 11
0
    def getCsMasks(self, un, mm):
        """Take a mask and return the masks fetched for it from cs.

        Returns a dict mapping each chemspider identifier to the mask
        produced by ``self.GetCompoundInfo``.  The dict is empty when none
        of the tables in `un` is listed in ``self.queryTables``.
        Identifiers come from the 'chemspider' table of `un` and from
        ``self.SimpleSearch`` on those 'smiles' identifiers of `un` that the
        chemspider lookups did not already return.
        """
        found = {}
        merged = mask({}, mm.idpatterns)

        queryable = [tab in self.queryTables for tab in un.getTables()]
        if not any(queryable):
            return found

        # direct lookups for chemspider identifiers already in the mask
        if un.hasTable('chemspider'):
            for csid in un.getIdentifiers('chemspider'):
                info = self.GetCompoundInfo(mm, csid)
                found[csid] = info
                merged.merge(info)

        # smiles of the input mask that the lookups above did not yield
        pending = []
        if un.hasTable('smiles'):
            pending = un.getIdentifiers('smiles')
            if merged.hasTable('smiles'):
                pending = [smi for smi in pending
                           if smi not in merged.getIdentifiers('smiles')]

        for smi in pending:
            for csid in self.SimpleSearch(smi):
                if csid in found:
                    continue
                info = self.GetCompoundInfo(mm, csid)
                found[csid] = info
                merged.merge(info)
        return found
Esempio n. 12
0
    def getCsMasks(self, un, mm):
        """Collect cs masks for the identifiers of the mask `un`.

        The result maps chemspider identifiers to masks built by
        ``self.GetCompoundInfo``.  Nothing is looked up (and an empty dict
        is returned) unless at least one table of `un` appears in
        ``self.queryTables``.
        """
        res = {}
        seen = mask({}, mm.idpatterns)
        if not any([name in self.queryTables for name in un.getTables()]):
            return res

        def remember(csid):
            # fetch the compound info once and fold it into the running mask
            res[csid] = self.GetCompoundInfo(mm, csid)
            seen.merge(res[csid])

        if un.hasTable('chemspider'):
            for csid in un.getIdentifiers('chemspider'):
                remember(csid)

        # only search for smiles that were not returned by the lookups above
        smiles = []
        if un.hasTable('smiles'):
            smiles = un.getIdentifiers('smiles')
            if seen.hasTable('smiles'):
                smiles = [s for s in smiles
                          if s not in seen.getIdentifiers('smiles')]

        for ide in smiles:
            for csid in self.SimpleSearch(ide):
                if csid not in res:
                    remember(csid)
        return res
Esempio n. 13
0
    def process(self):
        """Parse an SDF file and import NIST numbers, synonyms, CAS numbers
        and formulas.

        A line starting with ``$$$$`` ends a record: the current mask is
        stored via ``parent.setMask`` when ``nid(parent.mm)`` is greater
        than 1 and a new mask is started.  After a recognised data header
        the following lines, up to the first one that is empty after
        stripping, are added as identifiers of the matching table.
        """
        parent = self.parent
        current = mask({}, parent.mm.idpatterns)
        line = parent.getLine()
        parent.mm.setTableWeak('formula')

        def read_values(target, table, weak):
            # consume the value lines that follow a data header
            value = parent.getLine().strip()
            while value != "":
                if weak:
                    conf = parent.mm.confidence['weak']
                else:
                    conf = parent.confid
                target.append(table, value, conf, parent.sourceid)
                value = parent.getLine().strip()

        while line:
            if line.startswith('$$$$'):
                if current.nid(parent.mm) > 1:
                    parent.setMask(current)
                current = mask({}, parent.mm.idpatterns)
            elif line.startswith(('>  <NAME>', '>  <SYNONYMS>')):
                read_values(current, 'synonym', False)
            elif line.startswith('>  <NISTNO>'):
                read_values(current, 'nist', False)
            elif line.startswith('>  <CASNO>'):
                read_values(current, 'cas', False)
            elif line.startswith('>  <FORMULA>'):
                # formulas are only weak evidence
                read_values(current, 'formula', True)
            line = parent.getLine()

        # the last record may not be followed by a terminator
        if current.nid(parent.mm) > 1:
            parent.setMask(current)
Esempio n. 14
0
    def process(self):
        """Parse an SDF file and import any NISTID, synonyms, CASNO
        and formula.

        Records are separated by lines starting with ``$$$$``.  Each known
        data header is followed by value lines which are read until a line
        that is empty after stripping; every value is appended to the table
        that belongs to the header.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)
        ll = parent.getLine()
        parent.mm.setTableWeak('formula')

        # header prefix -> (table, use weak confidence)
        headers = (
            ('>  <NAME>', 'synonym', False),
            ('>  <SYNONYMS>', 'synonym', False),
            ('>  <NISTNO>', 'nist', False),
            ('>  <CASNO>', 'cas', False),
            ('>  <FORMULA>', 'formula', True),
        )

        while ll:
            if ll.startswith('$$$$'):
                # end of record: store it and start a new mask
                if un.nid(parent.mm) > 1:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
            for prefix, table, weak in headers:
                if not ll.startswith(prefix):
                    continue
                value = parent.getLine().strip()
                while value != "":
                    conf = parent.mm.confidence['weak'] if weak else parent.confid
                    un.append(table, value, conf, parent.sourceid)
                    value = parent.getLine().strip()
                break
            ll = parent.getLine()
        if un.nid(parent.mm) > 1:
            parent.setMask(un)
Esempio n. 15
0
    def process(self):
        """Parse the RIKEN standards library file.

        Lines are read from ``parent.getLine()`` until a falsy value is
        returned.  A line matching ``^Name: `` starts a new record: the mask
        gathered so far is stored via ``parent.setMask`` when ``un.nid()``
        is greater than 1, and the name becomes the 'riken' identifier of a
        fresh mask.  'cas', 'kegg' and 'synonym' identifiers are added with
        ``parent.confid``; 'formula' and 'smiles' with weak confidence.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)
        ll = parent.getLine()
        parent.mm.setTableWeak('formula')

        while ll:
            x = re.findall('^Name: (.+)', ll)
            if x:
                # new record: send off the previous one first
                if un.nid() > 1:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
                un.append('riken', x[0].strip(),
                          parent.confid, parent.sourceid)
            x = re.findall(r'CASNO: (\d{1,7}-\d{2}-\d{1})', ll)
            if x:
                un.append('cas', x[0].strip(),
                          parent.confid, parent.sourceid)
            x = re.findall('^KEGG: ([A-Z][0-9]{5})', ll)
            if x:
                un.append('kegg', x[0].strip(),
                          parent.confid, parent.sourceid)
            x = re.findall('^Formula: (.+)', ll)
            if x:
                un.append('formula', x[0].strip(),
                          parent.mm.confidence['weak'], parent.sourceid)
            x = re.findall('^SMILES: (.+)', ll)
            if x:
                un.append('smiles', x[0].strip(),
                          parent.mm.confidence['weak'], parent.sourceid)
            x = re.findall('Synonym: Name:(.+)', ll)
            if x:
                un.append('synonym', x[0].strip(),
                          parent.confid, parent.sourceid)
            ll = parent.getLine()
        # send off the last mask if we have one
        if un.nid() > 1:
            parent.setMask(un)
Esempio n. 16
0
    def process(self):
        """Parse a file with metabocards from hmdb.

        The first line must start with ``#BEGIN_METABOCARD``; otherwise
        (including when no line could be read at all) ``parserError`` is
        raised.  Within a card, a ``#`` line whose stripped text is a key of
        ``self.tableDict`` selects the confidence and table for the value
        lines that follow; other ``#`` lines switch collection off.  Value
        lines are split on ``"; "`` and every entry not starting with
        "not available" is appended to the mask.  At ``#END_METABOCARD`` the
        mask is stored when ``un.nid(parent.mm)`` is greater than 1.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)
        ll = parent.getLine(comment='\n')
        if not ll or not ll.startswith("#BEGIN_METABOCARD"):
            raise parserError("file does not seem to contain metabocards: "
                              + str(ll))
        parent.mm.setTableWeak('formula')
        ok = False

        while ll:
            if ll.startswith('#END_METABOCARD'):
                if un.nid(parent.mm) > 1:
                    parent.setMask(un)

            elif ll.startswith("#BEGIN_METABOCARD"):
                un = mask({}, parent.mm.idpatterns)

            elif ll.startswith("#"):
                # field header: collect the following values only if the
                # field is one we know a table for
                key = ll.strip()
                ok = key in self.tableDict
                if ok:
                    con = self.tableDict[key][0]
                    tab = self.tableDict[key][1]

            elif ok:
                for iden in ll.split("; "):
                    if iden.lower().startswith("not available"):
                        continue
                    try:
                        un.append(tab, iden, con, parent.sourceid)
                    except idMisMatchError:
                        print("#OFFENDING LINE " +
                              str(parent.lineNum) + "@" +
                              tab + " : " + str(iden))
            ll = parent.getLine(comment='\n')
Esempio n. 17
0
    def process(self):
        """Parse a file with metabocards from hmdb.

        Raises ``parserError`` unless the first line read starts with
        ``#BEGIN_METABOCARD`` (an input without any line is rejected the
        same way).  ``#BEGIN_METABOCARD`` starts a new mask and
        ``#END_METABOCARD`` stores it if ``un.nid(parent.mm)`` is greater
        than 1.  Any other ``#`` line is a field header; when its stripped
        text is a key of ``self.tableDict`` the entry supplies the
        confidence and table used for the following value lines.  Values are
        separated by ``"; "`` and entries starting with "not available" are
        ignored.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)
        ll = parent.getLine(comment='\n')
        if not ll or not ll.startswith("#BEGIN_METABOCARD"):
            raise parserError("file does not seem to contain metabocards: "
                              + str(ll))
        parent.mm.setTableWeak('formula')
        ok = False

        while ll:
            if ll.startswith('#END_METABOCARD'):
                if un.nid(parent.mm) > 1:
                    parent.setMask(un)

            elif ll.startswith("#BEGIN_METABOCARD"):
                un = mask({}, parent.mm.idpatterns)

            elif ll.startswith("#"):
                header = ll.strip()
                ok = False
                if header in self.tableDict:
                    entry = self.tableDict[header]
                    con = entry[0]
                    tab = entry[1]
                    ok = True

            else:
                if ok:
                    for iden in ll.split("; "):
                        if not iden.lower().startswith("not available"):
                            try:
                                un.append(tab, iden, con, parent.sourceid)
                            except idMisMatchError:
                                # report the rejected identifier and go on
                                print("#OFFENDING LINE " +
                                      str(parent.lineNum) + "@" +
                                      tab + " : " + str(iden))
            ll = parent.getLine(comment='\n')
Esempio n. 18
0
def test_insert_delete_mask(memory_db):
    """Insert a mask, query across its two tables, then drop it again."""
    for table in ('table1', 'table2'):
        memory_db.createIdTable(table)

    inserted = mask()
    inserted.append('table1', 'foo', 'strong', 'test', 'baz')
    inserted.append('table2', 'bar', 'strong', 'test', 'baz')
    memory_db.setMask(inserted)

    # after the insert 'foo' in table1 maps to 'bar' in table2
    hits = memory_db.simpleQuery('foo', 'table1', 'table2')
    assert 'bar' in hits[0][0]

    # after the drop the same query yields nothing
    memory_db.dropMask(inserted)
    hits = memory_db.simpleQuery('foo', 'table1', 'table2')
    assert len(hits[0][0]) == 0

    memory_db.close()
    # no file named ':memory:' may exist after closing
    assert not os.path.exists(':memory:')
Esempio n. 19
0
    def process(self):
        """Parse the RIKEN standards library file.

        Reads lines from ``parent.getLine()`` until a falsy value comes
        back.  Each line matching ``^Name: `` opens a new record with the
        name as 'riken' identifier; the previous mask is stored through
        ``parent.setMask`` first if ``un.nid()`` exceeds 1.  Every line is
        searched for 'cas', 'kegg', 'formula', 'smiles' and 'synonym'
        identifiers; 'formula' and 'smiles' get weak confidence.
        """
        parent = self.parent
        un = mask({}, parent.mm.idpatterns)
        ll = parent.getLine()
        parent.mm.setTableWeak('formula')

        while ll:
            name = re.findall('^Name: (.+)', ll)
            if name:
                if un.nid() > 1:
                    parent.setMask(un)
                un = mask({}, parent.mm.idpatterns)
                un.append('riken', name[0].strip(), parent.confid,
                          parent.sourceid)
            x = re.findall(r'CASNO: (\d{1,7}-\d{2}-\d{1})', ll)
            if x:
                un.append('cas', x[0].strip(), parent.confid, parent.sourceid)
            x = re.findall('^KEGG: ([A-Z][0-9]{5})', ll)
            if x:
                un.append('kegg', x[0].strip(), parent.confid, parent.sourceid)
            x = re.findall('^Formula: (.+)', ll)
            if x:
                un.append('formula', x[0].strip(),
                          parent.mm.confidence['weak'], parent.sourceid)
            x = re.findall('^SMILES: (.+)', ll)
            if x:
                un.append('smiles', x[0].strip(), parent.mm.confidence['weak'],
                          parent.sourceid)
            x = re.findall('Synonym: Name:(.+)', ll)
            if x:
                un.append('synonym', x[0].strip(), parent.confid,
                          parent.sourceid)
            ll = parent.getLine()
        # the final record has no following Name line to trigger the store
        if un.nid() > 1:
            parent.setMask(un)
Esempio n. 20
0
 def process(self):
     """Parse a column-separated table, one mask per line.

     Each line from ``parent.getLine()`` is split into columns with
     ``fixLine`` using ``parent.sep1``; column ``i`` belongs to
     ``parent.tables[i]`` and is appended with ``self.confids[i]``.  A
     column may hold several identifiers separated by ``parent.sep2``.
     Columns whose first entry is empty or matches ``parent.na`` are
     skipped, as are single entries matching ``parent.na``.  Non-empty
     masks are passed to ``parent.setMask``.

     Lines with the wrong number of columns and lines that raise any other
     ``Exception`` are reported on stdout and skipped.  A
     KeyboardInterrupt is converted to a plain ``Exception``.
     """
     parent = self.parent
     ncol = len(parent.tables)
     # Single-column input is allowed on purpose: single identifiers might
     # still map to bins.
     ll = parent.getLine()
     while ll:
         # Escape hashes exactly once per line.  The first line used to be
         # escaped twice, and an empty input failed on the extra replace.
         ll = ll.replace("#", "\\#")
         try:
             vec = fixLine(ll, sep=parent.sep1)
             if len(vec) != ncol:
                 raise fileFormatError("Number of columns doesn't match the header :"
                                       + str(parent.lineNum) + ll)
             un = mask({}, parent.mm.idpatterns)
             for i, column in enumerate(vec):
                 idvec = column.strip().split(parent.sep2)
                 if idvec[0] and not re.match(parent.na, idvec[0]):
                     for ide in idvec:
                         ide = ide.strip()
                         if not re.match(parent.na, ide):
                             try:
                                 un.append(parent.tables[i], ide,
                                           self.confids[i],
                                           parent.sourceid)
                             except idMisMatchError:
                                 print("#OFFENDING LINE " +
                                       str(parent.lineNum) + "@" +
                                       parent.tables[i] +
                                       " : " + str(ide))
             # no empty masks
             if not un.isEmpty():
                 parent.setMask(un)
         except KeyboardInterrupt:
             raise Exception("Interrupt caught, breaking")
         except fileFormatError:
             print("#ERROR: format problem" + ll)
         except Exception:
             # best effort: report the line and carry on with the next one
             print("#ERROR:" + ll)
         ll = parent.getLine()
Esempio n. 21
0
 def process(self):
     """Parse a table file and store one mask per input line.

     Lines are read from ``parent.getLine()`` until a falsy value is
     returned.  ``fixLine`` splits a line on ``parent.sep1``; the number of
     columns must equal ``len(parent.tables)``.  The entries of column
     ``i`` (separated by ``parent.sep2``) go to table ``parent.tables[i]``
     with confidence ``self.confids[i]``.  Entries matching ``parent.na``
     are ignored, and a column is ignored entirely when its first entry is
     empty or matches ``parent.na``.

     Problems with a single line are printed and the line is skipped; a
     KeyboardInterrupt is re-raised as a plain ``Exception``.
     """
     parent = self.parent
     ncol = len(parent.tables)
     # Inserting a single identifier per line is allowed since it might
     # still map to bins.
     ll = parent.getLine()
     while ll:
         # Hashes are escaped once per line here; escaping the first line
         # before the loop as well doubled its backslashes and failed on
         # empty input.
         ll = ll.replace("#", "\\#")
         try:
             vec = fixLine(ll, sep=parent.sep1)
             if len(vec) != ncol:
                 raise fileFormatError("Number of columns doesn't match the header :"
                                       + str(parent.lineNum) + ll)
             un = mask({}, parent.mm.idpatterns)
             for i, field in enumerate(vec):
                 idvec = field.strip().split(parent.sep2)
                 first = idvec[0]
                 if not first or re.match(parent.na, first):
                     continue
                 for ide in idvec:
                     ide = ide.strip()
                     if re.match(parent.na, ide):
                         continue
                     try:
                         un.append(parent.tables[i], ide,
                                   self.confids[i], parent.sourceid)
                     except idMisMatchError:
                         print("#OFFENDING LINE " +
                               str(parent.lineNum) + "@" +
                               parent.tables[i] +
                               " : " + str(ide))
             # no empty masks
             if not un.isEmpty():
                 parent.setMask(un)
         except KeyboardInterrupt:
             raise Exception("Interrupt caught, breaking")
         except fileFormatError:
             print("#ERROR: format problem" + ll)
         except Exception:
             # anything else is reported and the line is dropped
             print("#ERROR:" + ll)
         ll = parent.getLine()
Esempio n. 22
0
 def GetCompoundInfo(self, mm, csid):
     """
     take a csid, get a mask

     Fetches the compound-info URL for the (url-safe) csid and copies the
     contents of the InChI, SMILES and InChIKey elements of the XML answer
     into the 'inchi', 'smiles' and 'inchikey' tables of a new mask that
     also holds the csid in its 'chemspider' table.

     parameters:
     -`mm`: a metmask database
     -`csid`: a chemspider identifier

     returns the mask, which stays empty if fetching the URL yields a
     falsy result
     """
     tmpmask = mask({}, mm.idpatterns)
     csid = self.parent.urlSafe(csid)
     url = self.GetCompoundInfoURL + "CSID=" + str(
         csid) + "&token=" + self.token
     qRes = self.parent.getUrl(url)
     if not qRes:
         return tmpmask
     tmpmask.append('chemspider', csid, self.parent.confid,
                    self.parent.sourceid)
     searchResults = xml.dom.minidom.parse(qRes)
     # XML tag -> mask table; processed in this order
     for tag, table in (('InChI', 'inchi'),
                        ('SMILES', 'smiles'),
                        ('InChIKey', 'inchikey')):
         contents = list(
             mmquery.nodecontents(searchResults.getElementsByTagName(tag)))
         for ide in contents:
             tmpmask.append(table, ide, self.parent.confid,
                            self.parent.sourceid)
     return tmpmask
Esempio n. 23
0
    def process(self):
        """Parse a Cyc dump file and store one mask per usable row.

        Every line from ``parent.getLine()`` is split with ``fixLine`` on
        ``parent.sep1``. Of the resulting fields, 0 is stored as a synonym,
        1 holds further synonyms separated by ``*``, 3 is the formula, 4 the
        smiles, 5 the cross-reference links and, only for rows with exactly
        nine fields, 8 the cycpath.

        Rows with fewer than six fields and rows equal to the previous row
        are skipped. A row whose links field is empty adds no identifiers.
        A mask is passed to ``parent.setMask`` only when it holds more than
        one identifier and has at least one cas or kegg entry.
        """
        parent = self.parent
        parent.mm.setTableWeak('cycpath')
        parent.mm.setTableWeak('formula')
        weak = parent.mm.confidence['weak']

        # Patterns searched for in the links field and the table that each
        # captured value goes to. Raw strings keep the regex escapes valid.
        link_patterns = (
            (re.compile(r'CAS:(\d{1,7}-\d{2}-\d{1})'), 'cas'),
            (re.compile(r'LIGAND-CPD:([A-Z][0-9]{5})'), 'kegg'),
            (re.compile(r'PUBCHEM:([0-9]+)'), 'pubchem'),
            (re.compile(r'KNAPSACK:([0-9]+)'), 'knapsack'),
        )

        last = ""
        line = parent.getLine()
        while line:
            line = line.replace("'", "\\'")  # cyc likes apostrophes
            line = line.replace("#", "\\#")  # hashes in the middle of lines
            vec = fixLine(line, sep=parent.sep1)

            if len(vec) < 6:
                line = parent.getLine()
                continue

            # Normalise to six fields: name, synonyms, links, cycpath,
            # formula, smiles. The cycpath only exists in nine-field rows.
            cycpath = vec[8].strip() if len(vec) == 9 else ''
            vec = [vec[0], vec[1], vec[5], cycpath, vec[3], vec[4]]

            if vec == last:
                line = parent.getLine()
                continue
            name, synonyms, links, cycpath, formula, smiles = vec

            un = mask({}, parent.mm.idpatterns)
            # pointless if we dont have links
            if links != '':
                for pattern, table in link_patterns:
                    for ide in pattern.findall(links):
                        un.append(table, ide, parent.confid,
                                  parent.sourceid)
                if formula != '':
                    un.append('formula', formula.replace(" ", ""), weak,
                              parent.sourceid)
                if smiles != '':
                    un.append('smiles', smiles, weak, parent.sourceid)
                un.append('synonym', name, weak, parent.sourceid)
                if synonyms != '':
                    for syn in synonyms.split("*"):
                        un.append('synonym', syn, weak, parent.sourceid)
                if cycpath != '':
                    un.append('cycpath', cycpath, weak, parent.sourceid)
                # Drop masks with too few identifiers or without any
                # cas/kegg entry.
                if un.nid() < 2 or \
                        not (un.hasTable('cas') or un.hasTable('kegg')):
                    un = mask({}, parent.mm.idpatterns)
            if un.nid() > 1:
                parent.setMask(un)
            last = vec
            line = parent.getLine()
Esempio n. 24
0
    def pubchem2mask(self, docSum):
        """Turn a docsum node into a mask.

        The text of the ``Id`` element is stored as ``cid``. Every ``Item``
        element that has child nodes is then handled by its ``Name``
        attribute:

        - ``SynonymList``: each synonym is stripped of link markup, the word
          "ligand" and commas, then stored in the first of kegg, cas, chebi
          or inchi that ``guessTable`` reports for it, or as a weak synonym
          when none of them applies.
        - any name listed in ``simple_items`` below: the value of the item's
          first child node is stored in the mapped table.

        Items with any other name are ignored. Returns the filled mask.
        """
        parent = self.parent
        idpatterns = parent.mm.idpatterns
        un = mask({}, idpatterns)
        cid = next(mmquery.nodecontents(docSum.getElementsByTagName("Id")))
        un.append('cid', cid, parent.confid, parent.sourceid)
        markup = re.compile('(<a href[^>]*>)|(</a>)|(ligand)|(,)')
        cnf = parent.confid
        src = parent.sourceid
        weak = parent.mm.confidence['weak']

        # Item name -> (mask table, confidence). The original if/elif chain
        # listed 'CanonicalSmile' and 'XLogP' twice; the second copies could
        # never run and are dropped.
        simple_items = {
            'IUPACName': ('iupac', cnf),
            'CanonicalSmile': ('smiles', cnf),
            'InChIKey': ('inchikey', cnf),
            # annotation section
            'MolecularFormula': ('formula', weak),
            'MolecularWeight': ('weight', weak),
            'TotalFormalCharge': ('totalcharge', weak),
            'XLogP': ('xlogp', weak),
            'HydrogenBondDonorCount': ('hbonddonor', weak),
            'HydrogenBondAcceptorCount': ('hbondacceptor', weak),
            'HeavyAtomCount': ('heavyatom', weak),
            'TPSA': ('tpsa', weak),
        }
        # Tables a synonym may be promoted to, in order of precedence.
        id_tables = ('kegg', 'cas', 'chebi', 'inchi')

        for item in docSum.getElementsByTagName('Item'):
            if len(item.childNodes) == 0:
                continue
            name = item.getAttribute('Name')
            if name == 'SynonymList':
                for s in list(mmquery.nodecontents(item.childNodes)):
                    s = markup.sub('', s.strip())
                    # Guess once per synonym instead of once per table.
                    guesses = guessTable(s, idpatterns)
                    for table in id_tables:
                        if table in guesses:
                            un.append(table, s, cnf, src)
                            break
                    else:
                        un.append('synonym', s, weak, src)
            elif name in simple_items:
                table, confidence = simple_items[name]
                un.append(table, item.childNodes[0].nodeValue,
                          confidence, src)
        return un
Esempio n. 25
0
    def chebi2mask(self, mm, chebiId):
        """
        Get a mask containing the info associated with a chebi id.

        The complete entry is fetched from ``self.getComplete`` and parsed
        as XML.

        parameters:
        -`mm`: a metmask database; its ``idpatterns`` seed the result mask
        -`chebiId`: the chebi identifier to fetch

        Returns a mask with the chebi, smiles, synonym, inchi, iupac, kegg,
        cas and formula values found in the response. An empty ``mask({})``
        is returned when the fetch yields nothing, the parsed document is
        falsy, or the response has no ``ns1:return`` element.
        """
        parent = self.parent
        un = mask({}, mm.idpatterns)
        qRes = parent.getUrl(self.getComplete + "chebiId=" + str(chebiId))
        if not qRes:
            return mask({})
        searchResults = xml.dom.minidom.parse(qRes)
        if not searchResults:
            return mask({})
        returned = searchResults.getElementsByTagName('ns1:return')
        if not returned:
            # this would have been reasonable but chebi has problems so that
            # some entries exist, but cant be fetched. valid chebi but we
            # cant query for it, hence, skip completely
            return mask({})
        retList = returned[0]

        strong = parent.confid
        weak = parent.mm.confidence['weak']
        src = parent.sourceid

        def first_text(node, tag):
            # First text content below `node` for `tag`, or None when the
            # tag is absent or has no content (previously an IndexError).
            nodes = node.getElementsByTagName(tag)
            if not nodes:
                return None
            values = list(query.nodecontents(nodes))
            return values[0] if values else None

        def section_data(tag):
            # All ns1:data contents of the FIRST `tag` element, or [].
            # NOTE(review): only the first element is read, as before;
            # confirm whether later elements should be included too.
            sections = searchResults.getElementsByTagName(tag)
            if not sections:
                return []
            return list(
                query.nodecontents(
                    sections[0].getElementsByTagName("ns1:data")))

        # chebiid, stored without its "CHEBI:" prefix
        value = first_text(retList, "ns1:chebiId")
        if value is not None:
            un.append('chebi', value.replace("CHEBI:", ""), strong, src)

        # smiles
        value = first_text(retList, "ns1:smiles")
        if value is not None:
            un.append('smiles', value, strong, src)

        # synonym
        for sy in section_data('ns1:Synonyms'):
            un.append('synonym', sy, weak, src)

        # inchi
        value = first_text(retList, "ns1:inchi")
        if value is not None:
            un.append('inchi', value, strong, src)

        # iupac
        for sy in section_data('ns1:IupacNames'):
            un.append('iupac', sy, strong, src)

        # kegg and cas: typed links, kept only when the type text matches
        typed_links = (
            ('ns1:DatabaseLinks', 'KEGG COMPOUND accession', 'kegg'),
            ('ns1:RegistryNumbers', 'CAS Registry Number', 'cas'),
        )
        for tag, wanted, table in typed_links:
            for link in searchResults.getElementsByTagName(tag):
                if first_text(link, "ns1:type") == wanted:
                    for sy in list(
                            query.nodecontents(
                                link.getElementsByTagName("ns1:data"))):
                        un.append(table, sy, strong, src)

        # formula
        for sy in section_data('ns1:Formulae'):
            un.append('formula', sy, weak, src)

        return un
Esempio n. 26
0
    def getChebiMasks(self, un, mm):
        """
        Collect the ChEBI entries that relate to the mask `un`.

        1. Entries for the chebi identifiers already in `un` are fetched.
        2. The iupac, cas, kegg, inchi and smiles identifiers of `un` are
           searched through ``self.getLite``, skipping values that the
           entries fetched so far already contain; every hit not seen before
           is fetched too.
        3. The children (``self.getChebiChildren``) of everything found are
           fetched as well.

        Returns a dict mapping chebi identifier -> mask. The dict is empty
        when none of the tables in `un` is listed in ``self.queryTables``.
        """
        found = {}
        merged = mask({}, mm.idpatterns)
        if all(table not in self.queryTables for table in un.getTables()):
            return found

        def fetch(chebi):
            # Retrieve one entry, remember it and fold it into `merged`.
            found[chebi] = self.chebi2mask(mm, chebi)
            merged.merge(found[chebi])

        def pending(table):
            # Identifiers of `un` in `table` that `merged` does not yet
            # hold, so the same identifier is not queried several times.
            if not un.hasTable(table):
                return []
            candidates = un.getIdentifiers(table)
            if not merged.hasTable(table):
                return candidates
            return [
                ide for ide in candidates
                if ide not in merged.getIdentifiers(table)
            ]

        if un.hasTable('chebi'):
            for chebi in un.getIdentifiers('chebi'):
                fetch(chebi)

        # table in `un` -> query-string suffix selecting the search category
        searches = (
            ('iupac', "&searchCategory=IUPAC+NAME"),
            ('cas', "&searchCategory=REGISTRY+NUMBER"),
            ('kegg', "&searchCategory=DATABASE+LINK"),
            ('inchi', "&searchCategory=INCHI"),
            ('smiles', "&searchCategory=SMILES"),
        )
        for table, suffix in searches:
            for ide in pending(table):
                qUrl = self.getLite + "search=" + \
                       self.parent.urlSafe(ide) + suffix
                for chebi in self.url2ids(qUrl):
                    if chebi not in found:
                        fetch(chebi)

        # Snapshot the keys: `found` grows while the children are added.
        for chebi in list(found.keys()):
            for child in self.getChebiChildren(chebi):
                if child not in found:
                    found[child] = self.chebi2mask(mm, child)
        return found
Esempio n. 27
0
    def process(self):
        """Parse a Cyc compounds.dat file and store one mask per record.

        Lines are read with ``parent.getLine(comment='#')`` and stripped. A
        line equal to ``//`` closes the current record: its mask is passed
        to ``parent.setMask`` when it holds more than two identifiers, and a
        fresh mask is started. All other content is matched against the
        field patterns below, which collect the frame id (stored in table
        ``self.uniqueid``), synonyms, cas, inchi, smiles and kegg values.
        The mask still open after the last line is stored under the same
        condition.
        """
        parent = self.parent

        # (pattern, table, weak?) in the order the fields are probed. Raw
        # strings keep the backslash escapes valid; the regexes themselves
        # are unchanged.
        fields = (
            # frameid
            (re.compile(r"UNIQUE-ID - (.*)"), self.uniqueid, False),
            # synonym
            (re.compile(r"COMMON-NAME - (.*)"), 'synonym', True),
            (re.compile(r"SYNONYMS - (.*)"), 'synonym', True),
            # cas
            (re.compile(r'DBLINKS - \(CAS "(\d{1,7}-\d{2}-\d{1})"'),
             'cas', False),
            # inchi
            (re.compile(r"INCHI - (.*)"), 'inchi', False),
            # smiles
            (re.compile(r"SMILES - (.*)"), 'smiles', True),
            # kegg
            (re.compile(r'DBLINKS - \(LIGAND-CPD "([A-Z][0-9]{5})"'),
             'kegg', False),
        )

        def new_mask():
            # Fresh mask for the next record.
            fresh = mask({}, parent.mm.idpatterns)
            fresh.MIN_OVERLAP = 1
            return fresh

        un = new_mask()
        ll = parent.getLine(comment='#')
        while ll:
            ll = ll.strip()
            if ll == '//':
                if un.nid() > 2:
                    parent.setMask(un)
                un = new_mask()
            for pattern, table, is_weak in fields:
                x = pattern.findall(ll)
                if x:
                    if is_weak:
                        confidence = parent.mm.confidence['weak']
                    else:
                        confidence = parent.confid
                    un.append(table, x[0], confidence, parent.sourceid)
            ll = parent.getLine(comment='#')
        if un.nid() > 2:
            parent.setMask(un)
Esempio n. 28
0
    def getChebiMasks(self, un, mm):
        """
        Gather the ChEBI entries connected to the mask `un`.

        1. Fetch the entries of the chebi identifiers `un` already has.
        2. Search ``self.getLite`` for the iupac, cas, kegg, inchi and smiles
           identifiers of `un` (values already present in the fetched
           entries are skipped) and fetch every hit not seen before.
        3. Fetch the children (``self.getChebiChildren``) of all entries
           found in steps 1 and 2.

        Returns a dict mapping chebi identifier -> mask, which is empty when
        no table of `un` appears in ``self.queryTables``.
        """
        found = {}
        merged = mask({}, mm.idpatterns)
        if all(table not in self.queryTables for table in un.getTables()):
            return found

        def fetch(chebi):
            # Retrieve one entry, remember it and fold it into `merged`.
            found[chebi] = self.chebi2mask(mm, chebi)
            merged.merge(found[chebi])

        def pending(table):
            # Identifiers of `un` in `table` that `merged` does not yet
            # hold, so the same identifier is not queried several times.
            if not un.hasTable(table):
                return []
            candidates = un.getIdentifiers(table)
            if not merged.hasTable(table):
                return candidates
            return [
                ide for ide in candidates
                if ide not in merged.getIdentifiers(table)
            ]

        if un.hasTable('chebi'):
            for chebi in un.getIdentifiers('chebi'):
                fetch(chebi)

        # table in `un` -> query-string suffix selecting the search category
        searches = (
            ('iupac', "&searchCategory=IUPAC+NAME"),
            ('cas', "&searchCategory=REGISTRY+NUMBER"),
            ('kegg', "&searchCategory=DATABASE+LINK"),
            ('inchi', "&searchCategory=INCHI"),
            ('smiles', "&searchCategory=SMILES"),
        )
        for table, suffix in searches:
            for ide in pending(table):
                qUrl = self.getLite + "search=" + \
                       self.parent.urlSafe(ide) + suffix
                for chebi in self.url2ids(qUrl):
                    if chebi not in found:
                        fetch(chebi)

        # Snapshot the keys: `found` grows while the children are added.
        for chebi in list(found.keys()):
            for child in self.getChebiChildren(chebi):
                if child not in found:
                    found[child] = self.chebi2mask(mm, child)
        return found
Esempio n. 29
0
    def process(self):
        """Parse a Cyc dump file and store one mask per usable row.

        Every line from ``parent.getLine()`` is split with ``fixLine`` on
        ``parent.sep1``. Of the resulting fields, 0 is stored as a synonym,
        1 holds further synonyms separated by ``*``, 3 is the formula, 4 the
        smiles, 5 the cross-reference links and, only for rows with exactly
        nine fields, 8 the cycpath.

        Rows with fewer than six fields and rows equal to the previous row
        are skipped. A row whose links field is empty adds no identifiers.
        A mask is passed to ``parent.setMask`` only when it holds more than
        one identifier and has at least one cas or kegg entry.
        """
        parent = self.parent
        parent.mm.setTableWeak('cycpath')
        parent.mm.setTableWeak('formula')
        weak = parent.mm.confidence['weak']

        # Patterns searched for in the links field and the table that each
        # captured value goes to. Raw strings keep the regex escapes valid.
        link_patterns = (
            (re.compile(r'CAS:(\d{1,7}-\d{2}-\d{1})'), 'cas'),
            (re.compile(r'LIGAND-CPD:([A-Z][0-9]{5})'), 'kegg'),
            (re.compile(r'PUBCHEM:([0-9]+)'), 'pubchem'),
            (re.compile(r'KNAPSACK:([0-9]+)'), 'knapsack'),
        )

        last = ""
        line = parent.getLine()
        while line:
            line = line.replace("'", "\\'")  # cyc likes apostrophes
            line = line.replace("#", "\\#")  # hashes in the middle of lines
            vec = fixLine(line, sep=parent.sep1)

            if len(vec) < 6:
                line = parent.getLine()
                continue

            # Normalise to six fields: name, synonyms, links, cycpath,
            # formula, smiles. The cycpath only exists in nine-field rows.
            cycpath = vec[8].strip() if len(vec) == 9 else ''
            vec = [vec[0], vec[1], vec[5], cycpath, vec[3], vec[4]]

            if vec == last:
                line = parent.getLine()
                continue
            name, synonyms, links, cycpath, formula, smiles = vec

            un = mask({}, parent.mm.idpatterns)
            # pointless if we dont have links
            if links != '':
                for pattern, table in link_patterns:
                    for ide in pattern.findall(links):
                        un.append(table, ide, parent.confid,
                                  parent.sourceid)
                if formula != '':
                    un.append('formula', formula.replace(" ", ""), weak,
                              parent.sourceid)
                if smiles != '':
                    un.append('smiles', smiles, weak, parent.sourceid)
                un.append('synonym', name, weak, parent.sourceid)
                if synonyms != '':
                    for syn in synonyms.split("*"):
                        un.append('synonym', syn, weak, parent.sourceid)
                if cycpath != '':
                    un.append('cycpath', cycpath, weak, parent.sourceid)
                # Drop masks with too few identifiers or without any
                # cas/kegg entry.
                if un.nid() < 2 or \
                        not (un.hasTable('cas') or un.hasTable('kegg')):
                    un = mask({}, parent.mm.idpatterns)
            if un.nid() > 1:
                parent.setMask(un)
            last = vec
            line = parent.getLine()
Esempio n. 30
0
    def chebi2mask(self, mm, chebiId):
        """
        Get a mask containing the info associated with a chebi id.

        The complete entry is fetched from ``self.getComplete`` and parsed
        as XML.

        parameters:
        -`mm`: a metmask database; its ``idpatterns`` seed the result mask
        -`chebiId`: the chebi identifier to fetch

        Returns a mask with the chebi, smiles, synonym, inchi, iupac, kegg,
        cas and formula values found in the response. An empty ``mask({})``
        is returned when the fetch yields nothing, the parsed document is
        falsy, or the response has no ``ns1:return`` element.
        """
        parent = self.parent
        un = mask({}, mm.idpatterns)
        qRes = parent.getUrl(self.getComplete + "chebiId=" + str(chebiId))
        if not qRes:
            return mask({})
        searchResults = xml.dom.minidom.parse(qRes)
        if not searchResults:
            return mask({})
        returned = searchResults.getElementsByTagName('ns1:return')
        if not returned:
            # this would have been reasonable but chebi has problems so that
            # some entries exist, but cant be fetched. valid chebi but we
            # cant query for it, hence, skip completely
            return mask({})
        retList = returned[0]

        strong = parent.confid
        weak = parent.mm.confidence['weak']
        src = parent.sourceid

        def first_text(node, tag):
            # First text content below `node` for `tag`, or None when the
            # tag is absent or has no content (previously an IndexError).
            nodes = node.getElementsByTagName(tag)
            if not nodes:
                return None
            values = list(query.nodecontents(nodes))
            return values[0] if values else None

        def section_data(tag):
            # All ns1:data contents of the FIRST `tag` element, or [].
            # NOTE(review): only the first element is read, as before;
            # confirm whether later elements should be included too.
            sections = searchResults.getElementsByTagName(tag)
            if not sections:
                return []
            return list(
                query.nodecontents(
                    sections[0].getElementsByTagName("ns1:data")))

        # chebiid, stored without its "CHEBI:" prefix
        value = first_text(retList, "ns1:chebiId")
        if value is not None:
            un.append('chebi', value.replace("CHEBI:", ""), strong, src)

        # smiles
        value = first_text(retList, "ns1:smiles")
        if value is not None:
            un.append('smiles', value, strong, src)

        # synonym
        for sy in section_data('ns1:Synonyms'):
            un.append('synonym', sy, weak, src)

        # inchi
        value = first_text(retList, "ns1:inchi")
        if value is not None:
            un.append('inchi', value, strong, src)

        # iupac
        for sy in section_data('ns1:IupacNames'):
            un.append('iupac', sy, strong, src)

        # kegg and cas: typed links, kept only when the type text matches
        typed_links = (
            ('ns1:DatabaseLinks', 'KEGG COMPOUND accession', 'kegg'),
            ('ns1:RegistryNumbers', 'CAS Registry Number', 'cas'),
        )
        for tag, wanted, table in typed_links:
            for link in searchResults.getElementsByTagName(tag):
                if first_text(link, "ns1:type") == wanted:
                    for sy in list(
                            query.nodecontents(
                                link.getElementsByTagName("ns1:data"))):
                        un.append(table, sy, strong, src)

        # formula
        for sy in section_data('ns1:Formulae'):
            un.append('formula', sy, weak, src)

        return un
Esempio n. 31
0
    def process(self):
        """Parse a KEGG COMPOUNDS file and store one mask per entry.

        The first line must start with ``ENTRY``; otherwise, or when there
        is no first line at all, ``fileFormatError`` is raised. Every
        ``ENTRY`` line starts a new mask, after the previous one has been
        passed to ``parent.setMask`` if it is non-empty and still marked
        good. Within an entry the kegg id, synonyms (``NAME``, continued
        while a line ends with ``;``), formula, knapsack, cas, sid
        (``PubChem:``), chebi and pathway values are collected.

        An entry is marked not good, and therefore not stored, when its
        ENTRY line reads "Peptide Compound" or when a COMMENT line flags it
        as a generic compound, a coordination compound, or contains ``R=``.
        """
        parent = self.parent
        # annotate this table as fully weak
        parent.mm.setTableWeak('pathway')
        parent.mm.setTableWeak('formula')

        ll = parent.getLine()
        # An empty input is reported as a format error as well, rather than
        # failing inside re.match.
        if not ll or not re.match("^ENTRY", ll):
            raise fileFormatError("COMPOUNDS file doesn't look like expected")

        un = mask({}, parent.mm.idpatterns)
        good = True

        while ll:
            # check and send off gathered data
            # kegg
            x = re.findall(r'ENTRY *([A-Z][0-9]{5}) ', ll)
            if x:
                if not un.isEmpty() and good:
                    parent.setMask(un)

                un = mask({}, parent.mm.idpatterns)  # new mask
                un.append('kegg', x[0].strip(), parent.confid,
                          parent.sourceid)
                good = True

            # synonym
            x = re.findall(r'NAME +(.+)', ll)
            if x:
                un.append('synonym',
                          x[0].strip().replace(";", ""),
                          parent.confid, parent.sourceid)
                # a trailing ';' means the next line is another synonym
                while re.match(r".+;$", ll):
                    ll = parent.getLine()
                    if not ll:
                        # input ended in the middle of a NAME block
                        break
                    un.append('synonym',
                              ll.strip().replace(";", ""),
                              parent.confid,
                              parent.sourceid)
                if not ll:
                    break

            # formula
            x = re.findall(r'FORMULA +(.+)', ll)
            if x:
                un.append('formula', x[0], parent.mm.confidence['weak'],
                          parent.sourceid)
            # knapsack
            x = re.findall(r'KNApSAcK: (.+)', ll)
            if x:
                un.append('knapsack', x[0], parent.confid,
                          parent.sourceid)
            # cas
            if 'CAS:' in ll:
                for xx in re.findall(r'(\d{2,7}-\d{2}-\d{1})', ll):
                    un.append('cas', xx, parent.confid,
                              parent.sourceid)
            # sid <- obs not cid!
            x = re.findall(r'PubChem: (\d+)', ll)
            if x:
                un.append('sid', x[0], parent.confid,
                          parent.sourceid)
            # chebi
            x = re.findall(r'ChEBI: (\d+)', ll)
            if x:
                un.append('chebi', x[0], parent.confid,
                          parent.sourceid)
            # pathway; only the numeric part is stored
            x = re.findall(r'(PATHWAY)*\s*(ko|map)(\d+)', ll)
            if x:
                un.append('pathway', x[0][2],
                          parent.mm.confidence['weak'],
                          parent.sourceid)

            # comment, to get rid of kegg internal generics
            # The original pattern had a '$' right after the id, which made
            # a match impossible; without it the Peptide filter can fire.
            if re.match(r'ENTRY *[A-Z][0-9]{5} *Peptide *Compound$', ll):
                good = False
            if re.match('COMMENT', ll):
                if re.match('COMMENT     generic compound in reaction hierarchy', ll) or \
                        re.match('COMMENT     coordination compound', ll) or \
                        re.match('COMMENT.+R=.+', ll):
                    good = False

            ll = parent.getLine()

        # send of the last mask if we have one
        if not un.isEmpty() and good:
            parent.setMask(un)
Esempio n. 32
0
    def pubchem2mask(self, docSum):
        """Turn a docsum node into a mask.

        The text of the ``Id`` element is stored as ``cid``. Every ``Item``
        element that has child nodes is then handled by its ``Name``
        attribute:

        - ``SynonymList``: each synonym is stripped of link markup, the word
          "ligand" and commas, then stored in the first of kegg, cas, chebi
          or inchi that ``guessTable`` reports for it, or as a weak synonym
          when none of them applies.
        - any name listed in ``simple_items`` below: the value of the item's
          first child node is stored in the mapped table.

        Items with any other name are ignored. Returns the filled mask.
        """
        parent = self.parent
        idpatterns = parent.mm.idpatterns
        un = mask({}, idpatterns)
        cid = next(mmquery.nodecontents(docSum.getElementsByTagName("Id")))
        un.append('cid', cid, parent.confid, parent.sourceid)
        markup = re.compile('(<a href[^>]*>)|(</a>)|(ligand)|(,)')
        cnf = parent.confid
        src = parent.sourceid
        weak = parent.mm.confidence['weak']

        # Item name -> (mask table, confidence). The original if/elif chain
        # listed 'CanonicalSmile' and 'XLogP' twice; the second copies could
        # never run and are dropped.
        simple_items = {
            'IUPACName': ('iupac', cnf),
            'CanonicalSmile': ('smiles', cnf),
            'InChIKey': ('inchikey', cnf),
            # annotation section
            'MolecularFormula': ('formula', weak),
            'MolecularWeight': ('weight', weak),
            'TotalFormalCharge': ('totalcharge', weak),
            'XLogP': ('xlogp', weak),
            'HydrogenBondDonorCount': ('hbonddonor', weak),
            'HydrogenBondAcceptorCount': ('hbondacceptor', weak),
            'HeavyAtomCount': ('heavyatom', weak),
            'TPSA': ('tpsa', weak),
        }
        # Tables a synonym may be promoted to, in order of precedence.
        id_tables = ('kegg', 'cas', 'chebi', 'inchi')

        for item in docSum.getElementsByTagName('Item'):
            if len(item.childNodes) == 0:
                continue
            name = item.getAttribute('Name')
            if name == 'SynonymList':
                for s in list(mmquery.nodecontents(item.childNodes)):
                    s = markup.sub('', s.strip())
                    # Guess once per synonym instead of once per table.
                    guesses = guessTable(s, idpatterns)
                    for table in id_tables:
                        if table in guesses:
                            un.append(table, s, cnf, src)
                            break
                    else:
                        un.append('synonym', s, weak, src)
            elif name in simple_items:
                table, confidence = simple_items[name]
                un.append(table, item.childNodes[0].nodeValue,
                          confidence, src)
        return un