def process(self):
    """Parse a Golm Metabolome Database (GMD) standards library.

    Such as those found at
    http://csbdb.mpimp-golm.mpg.de/csbdb/gmd/msri/gmd_msri.html

    Lines are read with ``self.parent.getLine()`` until it returns a falsy
    value. Every line starting with ``Name:`` opens a new record; the mask
    gathered for the previous record is passed to ``parent.setMask`` when
    ``un.nid(parent.mm)`` is greater than one. The remaining lines of a
    record contribute 'mpimpclass', 'cas' and 'kegg' identifiers.
    """
    parent = self.parent
    mm = parent.mm
    # make sure that the necessary tables are in the db
    mm.setTableWeak('mpimpclass')

    un = mask({}, mm.idpatterns)
    line = parent.getLine()
    while line:
        if re.match('^Name:', line):
            # a new record starts: send off what was gathered so far
            if un.nid(mm) > 1:
                parent.setMask(un)
            un = mask({}, mm.idpatterns)

            names = re.findall('Name: (.+)', line)
            if names:
                mpimpid = names[0].strip()
                un.append('mpimp', mpimpid, parent.confid, parent.sourceid)

                # get synonyms from the title only; the sixth '_'-separated
                # field is used. Names with fewer fields simply yield no
                # synonym instead of raising IndexError.
                parts = mpimpid.split('_')
                if len(parts) > 5 and parts[5].strip() != '':
                    title = parts[5]
                    un.append('synonym', title, parent.confid,
                              parent.sourceid)
                    # also store the title without a trailing "(<n>TMS)"
                    derivative = re.match(r"(.*)\([0-9]+[tT][mM][sS]\)$",
                                          title)
                    if derivative:
                        un.append('synonym', derivative.group(1).strip(),
                                  parent.confid, parent.sourceid)

        # ***** gather info *****
        # mpimp class
        found = re.findall(r'Synonym: Metabolite \(class\):(.+)', line)
        if found:
            un.append('mpimpclass', found[0].strip(),
                      mm.confidence['weak'], parent.sourceid)
        # cas 1
        found = re.findall(r'CASNO: (.+)', line)
        if found:
            un.append('cas', found[0].strip(), parent.confid,
                      parent.sourceid)
        # cas 2
        found = re.findall(r'CAS\|(\d{1,7}-\d{2}-\d{1})_\(.+\)', line)
        if found:
            un.append('cas', found[0].strip(), parent.confid,
                      parent.sourceid)
        # kegg
        found = re.findall(r'KEGG\|([A-Z][0-9]{5})_.*\(.+\)', line)
        if found:
            un.append('kegg', found[0].strip(), parent.confid,
                      parent.sourceid)

        line = parent.getLine()

    # send off the last mask if we have one
    if un.nid(mm) > 1:
        parent.setMask(un)
def process(self):
    """Extend stored masks with information fetched through ``self.cs``.

    Every line returned by ``parent.getLine()`` is used as an ``_id`` to
    look up a mask with ``parent.mm.getMask(..., weak=True)``. For the first
    hit, ``self.cs.fixInchi`` is applied (and the mask re-stored and
    re-fetched when it reports a change), then ``self.cs.getCsMasks`` is
    queried. All returned masks get a fresh association from
    ``parent.mm.addAss()``, are merged into one mask and stored with
    ``parent.setMask(..., setass=False)``.

    A failing lookup is reported on stdout and the line is skipped, in the
    same way the PubChem parser handles it, instead of dropping into the
    debugger.
    """
    parent = self.parent
    while True:
        ll = parent.getLine()
        if not ll:
            break

        query_mask = mask({})
        query_mask.append('_id', ll)
        hits = parent.mm.getMask(query_mask, weak=True)
        if not hits:
            continue
        un = hits[0]

        if self.cs.fixInchi(un):
            # the mask was changed: store it and read it back
            parent.setMask(un, setass=False)
            hits = parent.mm.getMask(query_mask, weak=True)
            if hits:
                un = hits[0]

        try:
            csMasks = self.cs.getCsMasks(un, parent.mm)
        except Exception:
            print("#COMMENT unknown error")
            continue

        if parent.mm.debug:
            print("#COMMENT mask " + str(ll) + " csid " +
                  str(list(csMasks.keys())))

        merged = mask({})
        for csid in list(csMasks.keys()):
            csMasks[csid].setAllAssoc(parent.mm.addAss())
            merged.merge(csMasks[csid])
        parent.setMask(merged, setass=False)
def process(self):
    """Extend stored masks with information fetched through ``self.chebip``.

    The 'formula' table is marked weak first. Every line returned by
    ``parent.getLine()`` is used as an ``_id`` to look up a mask with
    ``parent.mm.getMask``. For the first hit ``self.chebip.getChebiMasks``
    is queried; all returned masks get a fresh association from
    ``parent.mm.addAss()``, are merged into one mask and stored with
    ``parent.setMask(..., setass=False)``.

    A failing lookup is reported on stdout and the line is skipped, in the
    same way the PubChem parser handles it, instead of dropping into the
    debugger.
    """
    parent = self.parent
    parent.mm.setTableWeak('formula')
    while True:
        ll = parent.getLine()
        if not ll:
            break

        query_mask = mask({})
        query_mask.append('_id', ll)
        hits = parent.mm.getMask(query_mask)
        if not hits:
            continue
        un = hits[0]

        try:
            chebiMasks = self.chebip.getChebiMasks(un, parent.mm)
        except Exception:
            print("#COMMENT unknown error")
            continue

        if parent.mm.debug:
            print("#COMMENT mask " + str(ll) + " chebi " +
                  str(list(chebiMasks.keys())))

        merged = mask({})
        for ch in list(chebiMasks.keys()):
            chebiMasks[ch].setAllAssoc(parent.mm.addAss())
            merged.merge(chebiMasks[ch])
        parent.setMask(merged, setass=False)
def process(self):
    """Extend stored masks with information fetched through ``self.pc``.

    A fixed set of annotation tables is marked weak first. Every line
    returned by ``parent.getLine()`` is used as an ``_id`` to look up a mask
    with ``parent.mm.getMask(..., weak=True)``. For the first hit
    ``self.pc.getPubchemMasks`` is queried; all returned masks get a fresh
    association from ``parent.mm.addAss()``, are merged into one mask and
    stored with ``parent.setMask(..., setass=False)``.

    A failing lookup is reported on stdout and the line is skipped.
    """
    parent = self.parent
    weakTables = ['formula', 'weight', 'totalcharge', 'xlogp',
                  'hbonddonor', 'hbondacceptor', 'heavyatom', 'tpsa']
    for table in weakTables:
        parent.mm.setTableWeak(table)

    while True:
        ll = parent.getLine()
        if not ll:
            break

        query_mask = mask({})
        query_mask.append('_id', ll)
        hits = parent.mm.getMask(query_mask, weak=True)
        if not hits:
            continue
        un = hits[0]

        try:
            pcMasks = self.pc.getPubchemMasks(un)
        except Exception:
            # only real errors are swallowed; KeyboardInterrupt and
            # SystemExit still stop the run
            print("#COMMENT unknown error")
            continue

        if parent.mm.debug:
            print("#COMMENT mask " + str(ll))

        merged = mask({})
        for pcmsk in pcMasks:
            pcmsk.setAllAssoc(parent.mm.addAss())
            merged.merge(pcmsk)
        parent.setMask(merged, setass=False)
def process(self):
    """Extend stored masks with information fetched through ``self.pc``.

    A fixed set of annotation tables is marked weak first. Every line
    returned by ``parent.getLine()`` is used as an ``_id`` to look up a mask
    with ``parent.mm.getMask(..., weak=True)``. For the first hit
    ``self.pc.getPubchemMasks`` is queried; all returned masks get a fresh
    association from ``parent.mm.addAss()``, are merged into one mask and
    stored with ``parent.setMask(..., setass=False)``.

    A failing lookup is reported on stdout and the line is skipped.
    """
    parent = self.parent
    weakTables = [
        'formula', 'weight', 'totalcharge', 'xlogp', 'hbonddonor',
        'hbondacceptor', 'heavyatom', 'tpsa'
    ]
    for table in weakTables:
        parent.mm.setTableWeak(table)

    while True:
        ll = parent.getLine()
        if not ll:
            break

        query_mask = mask({})
        query_mask.append('_id', ll)
        hits = parent.mm.getMask(query_mask, weak=True)
        if not hits:
            continue
        un = hits[0]

        try:
            pcMasks = self.pc.getPubchemMasks(un)
        except Exception:
            # only real errors are swallowed; KeyboardInterrupt and
            # SystemExit still stop the run
            print("#COMMENT unknown error")
            continue

        if parent.mm.debug:
            print("#COMMENT mask " + str(ll))

        merged = mask({})
        for pcmsk in pcMasks:
            pcmsk.setAllAssoc(parent.mm.addAss())
            merged.merge(pcmsk)
        parent.setMask(merged, setass=False)
def process(self):
    """Parse the compounds.dat file from the Cyc databases.

    Inserts the cycdb frame-id (stored in the table named by
    ``self.uniqueid``), synonyms, cas, inchi, smiles and kegg identifiers.
    Lines are read with ``parent.getLine(comment='#')``; a line consisting
    of ``//`` ends a record. A record is stored with ``parent.setMask`` only
    when ``un.nid()`` is greater than two.
    """
    parent = self.parent

    def new_mask():
        # every record starts from an empty mask with MIN_OVERLAP set to 1
        fresh = mask({}, parent.mm.idpatterns)
        fresh.MIN_OVERLAP = 1
        return fresh

    # (pattern, table, stored with weak confidence?) in evaluation order
    fields = (
        (r'UNIQUE-ID - (.*)', self.uniqueid, False),       # frameid
        (r'COMMON-NAME - (.*)', 'synonym', True),
        (r'SYNONYMS - (.*)', 'synonym', True),
        (r'DBLINKS - \(CAS "(\d{1,7}-\d{2}-\d{1})"', 'cas', False),
        (r'INCHI - (.*)', 'inchi', False),
        (r'SMILES - (.*)', 'smiles', True),
        (r'DBLINKS - \(LIGAND-CPD "([A-Z][0-9]{5})"', 'kegg', False),
    )

    un = new_mask()
    ll = parent.getLine(comment='#')
    while ll:
        ll = ll.strip()
        if ll == '//':
            if un.nid() > 2:
                parent.setMask(un)
            un = new_mask()

        for pattern, table, weak in fields:
            found = re.findall(pattern, ll)
            if found:
                if weak:
                    confid = parent.mm.confidence['weak']
                else:
                    confid = parent.confid
                un.append(table, found[0], confid, parent.sourceid)

        ll = parent.getLine(comment='#')

    # the file may end without a closing '//'
    if un.nid() > 2:
        parent.setMask(un)
def GetCompoundInfo(self, mm, csid):
    """Take a csid, get a mask.

    The URL is built from ``self.GetCompoundInfoURL``, the url-safe csid
    and ``self.token`` and fetched with ``self.parent.getUrl``. The reply
    is parsed as XML and the text of all ``InChI``, ``SMILES`` and
    ``InChIKey`` elements is added to the mask next to the csid itself.

    Parameters:
    -`mm`: a metmask database
    -`csid`: a chemspider identifier

    Returns a mask; it is empty when the URL could not be fetched.
    """
    parent = self.parent
    tmpmask = mask({}, mm.idpatterns)
    csid = parent.urlSafe(csid)
    url = (self.GetCompoundInfoURL + "CSID=" + str(csid) +
           "&token=" + self.token)
    qRes = parent.getUrl(url)
    if not qRes:
        return tmpmask

    tmpmask.append('chemspider', csid, parent.confid, parent.sourceid)
    searchResults = xml.dom.minidom.parse(qRes)

    # XML tag -> table that receives the text of those elements
    tag_tables = (('InChI', 'inchi'),
                  ('SMILES', 'smiles'),
                  ('InChIKey', 'inchikey'))
    for tag, table in tag_tables:
        elements = searchResults.getElementsByTagName(tag)
        for ide in mmquery.nodecontents(elements):
            tmpmask.append(table, ide, parent.confid, parent.sourceid)
    return tmpmask
def getCsMasks(self, un, mm):
    """Take a mask and get filled masks from cs.

    Returns a dict mapping csid to the mask produced by
    ``self.GetCompoundInfo``. The dict is empty when none of the tables of
    `un` is listed in ``self.queryTables``. The chemspider identifiers of
    `un` are resolved directly; afterwards every smiles of `un` that was
    not already returned by those lookups is sent to ``self.SimpleSearch``
    and the resulting, not yet known, csids are resolved as well.
    """
    res = {}
    tmpmask = mask({}, mm.idpatterns)
    if not any(x in self.queryTables for x in un.getTables()):
        return res

    if un.hasTable('chemspider'):
        for csid in un.getIdentifiers('chemspider'):
            res[csid] = self.GetCompoundInfo(mm, csid)
            tmpmask.merge(res[csid])

    smiles = []
    if un.hasTable('smiles'):
        smiles = un.getIdentifiers('smiles')
        if tmpmask.hasTable('smiles'):
            # avoid querying for identifiers we already got back;
            # looked up once instead of once per identifier
            known = tmpmask.getIdentifiers('smiles')
            smiles = [x for x in smiles if x not in known]

    for ide in smiles:
        for csid in self.SimpleSearch(ide):
            if csid not in res:
                res[csid] = self.GetCompoundInfo(mm, csid)
                tmpmask.merge(res[csid])
    return res
def process(self):
    """Parse an SDF file and import any NISTID, synonyms, CASNO and formula.

    A line starting with ``$$$$`` ends a record; the gathered mask is
    stored with ``parent.setMask`` when ``un.nid(parent.mm)`` is greater
    than one. The lines following a recognised ``> <TAG>`` header are read
    as values until an empty (after stripping) line is met.
    """
    parent = self.parent
    un = mask({}, parent.mm.idpatterns)
    ll = parent.getLine()
    parent.mm.setTableWeak('formula')

    def values():
        # yield the stripped lines that follow a header, up to the first
        # empty one
        entry = parent.getLine().strip()
        while entry != "":
            yield entry
            entry = parent.getLine().strip()

    while ll:
        if ll.startswith('$$$$'):
            if un.nid(parent.mm) > 1:
                parent.setMask(un)
            un = mask({}, parent.mm.idpatterns)

        # synonym
        if ll.startswith(('> <NAME>', '> <SYNONYMS>')):
            for entry in values():
                un.append('synonym', entry, parent.confid, parent.sourceid)
        # nist
        if ll.startswith('> <NISTNO>'):
            for entry in values():
                un.append('nist', entry, parent.confid, parent.sourceid)
        # cas
        if ll.startswith('> <CASNO>'):
            for entry in values():
                un.append('cas', entry, parent.confid, parent.sourceid)
        # formula
        if ll.startswith('> <FORMULA>'):
            for entry in values():
                un.append('formula', entry, parent.mm.confidence['weak'],
                          parent.sourceid)

        ll = parent.getLine()

    if un.nid(parent.mm) > 1:
        parent.setMask(un)
def process(self):
    """Parse an SDF file and import any NISTID, synonyms, CASNO and formula.

    A line starting with ``$$$$`` ends a record; the gathered mask is
    stored with ``parent.setMask`` when ``un.nid(parent.mm)`` is greater
    than one. The lines following a recognised ``> <TAG>`` header are read
    as values until an empty (after stripping) line is met.
    """
    parent = self.parent
    un = mask({}, parent.mm.idpatterns)
    ll = parent.getLine()
    parent.mm.setTableWeak('formula')

    def values():
        # yield the stripped lines that follow a header, up to the first
        # empty one
        entry = parent.getLine().strip()
        while entry != "":
            yield entry
            entry = parent.getLine().strip()

    while ll:
        if ll.startswith('$$$$'):
            if un.nid(parent.mm) > 1:
                parent.setMask(un)
            un = mask({}, parent.mm.idpatterns)

        # synonym
        if ll.startswith(('> <NAME>', '> <SYNONYMS>')):
            for entry in values():
                un.append('synonym', entry, parent.confid, parent.sourceid)
        # nist
        if ll.startswith('> <NISTNO>'):
            for entry in values():
                un.append('nist', entry, parent.confid, parent.sourceid)
        # cas
        if ll.startswith('> <CASNO>'):
            for entry in values():
                un.append('cas', entry, parent.confid, parent.sourceid)
        # formula
        if ll.startswith('> <FORMULA>'):
            for entry in values():
                un.append('formula', entry, parent.mm.confidence['weak'],
                          parent.sourceid)

        ll = parent.getLine()

    if un.nid(parent.mm) > 1:
        parent.setMask(un)
def process(self):
    """Parse the RIKEN standards library file.

    A line matching ``^Name: `` opens a new record; the mask gathered for
    the previous record is stored with ``parent.setMask`` when ``un.nid()``
    is greater than one. Within a record 'cas', 'kegg', 'formula', 'smiles'
    and 'synonym' identifiers are collected; formula and smiles are stored
    with weak confidence.
    """
    parent = self.parent
    un = mask({}, parent.mm.idpatterns)
    ll = parent.getLine()
    parent.mm.setTableWeak('formula')

    # (pattern, table, stored with weak confidence?) in evaluation order
    fields = (
        (r'CASNO: (\d{1,7}-\d{2}-\d{1})', 'cas', False),
        (r'^KEGG: ([A-Z][0-9]{5})', 'kegg', False),
        (r'^Formula: (.+)', 'formula', True),
        (r'^SMILES: (.+)', 'smiles', True),
        (r'Synonym: Name:(.+)', 'synonym', False),
    )

    while ll:
        name = re.findall(r'^Name: (.+)', ll)
        if name:
            # a new record starts: send off what was gathered so far
            if un.nid() > 1:
                parent.setMask(un)
            un = mask({}, parent.mm.idpatterns)
            un.append('riken', name[0].strip(), parent.confid,
                      parent.sourceid)

        for pattern, table, weak in fields:
            found = re.findall(pattern, ll)
            if found:
                if weak:
                    confid = parent.mm.confidence['weak']
                else:
                    confid = parent.confid
                un.append(table, found[0].strip(), confid, parent.sourceid)

        ll = parent.getLine()

    # send off the last mask if we have one
    if un.nid() > 1:
        parent.setMask(un)
def process(self):
    """Parse a file with metabocards from hmdb.

    The first line must start with ``#BEGIN_METABOCARD``, otherwise a
    ``parserError`` is raised. Header lines (starting with ``#``) that are
    keys of ``self.tableDict`` select the confidence and table used for the
    data lines that follow; data lines are split on ``"; "`` and every
    entry not starting with "not available" is appended to the current
    mask. ``#END_METABOCARD`` stores the mask with ``parent.setMask`` when
    ``un.nid(parent.mm)`` is greater than one.
    """
    parent = self.parent
    un = mask({}, parent.mm.idpatterns)
    ll = parent.getLine(comment='\n')
    if not ll.startswith("#BEGIN_METABOCARD"):
        raise parserError("file does not seem to contain metabocards: " +
                          ll)
    parent.mm.setTableWeak('formula')

    # True while the lines being read belong to a header in tableDict
    ok = False
    while ll:
        if ll.startswith('#END_METABOCARD'):
            if un.nid(parent.mm) > 1:
                parent.setMask(un)
        elif ll.startswith("#BEGIN_METABOCARD"):
            un = mask({}, parent.mm.idpatterns)
        elif ll.startswith("#"):
            header = ll.strip()
            ok = header in self.tableDict
            if ok:
                con = self.tableDict[header][0]
                tab = self.tableDict[header][1]
        elif ok:
            for iden in ll.split("; "):
                if iden.lower().startswith("not available"):
                    continue
                try:
                    un.append(tab, iden, con, parent.sourceid)
                except idMisMatchError:
                    print("#OFFENDING LINE " +
                          str(parent.lineNum) + "@" +
                          tab + " : " + str(iden))
        ll = parent.getLine(comment='\n')
def test_insert_delete_mask(memory_db):
    """A stored mask can be queried, and is gone again after dropMask."""
    for table in ('table1', 'table2'):
        memory_db.createIdTable(table)

    entry = mask()
    entry.append('table1', 'foo', 'strong', 'test', 'baz')
    entry.append('table2', 'bar', 'strong', 'test', 'baz')
    memory_db.setMask(entry)

    # 'foo' in table1 maps to 'bar' in table2 while the mask is stored
    hits = memory_db.simpleQuery('foo', 'table1', 'table2')
    assert 'bar' in hits[0][0]

    # after dropping the mask the same query yields nothing
    memory_db.dropMask(entry)
    hits = memory_db.simpleQuery('foo', 'table1', 'table2')
    assert len(hits[0][0]) == 0

    memory_db.close()
    # no file named ':memory:' must have been left behind
    assert not os.path.exists(':memory:')
def process(self):
    """Parse a delimited table with one column per entry of ``parent.tables``.

    Every line is split into columns with ``fixLine(..., sep=parent.sep1)``
    and every column into identifiers on ``parent.sep2``. Identifiers
    matching ``parent.na`` are skipped, and a column whose first identifier
    is empty or matches ``parent.na`` is skipped as a whole. The remaining
    identifiers are appended to one mask per line, using the table and the
    confidence (``self.confids``) of their column; non-empty masks are
    stored with ``parent.setMask``.

    Single-identifier inserts are allowed on purpose since they might map
    to bins. Lines with the wrong number of columns, and lines that fail
    for any other reason, are reported on stdout and skipped.
    """
    parent = self.parent
    ncol = len(parent.tables)

    ll = parent.getLine()
    while ll:
        # escape hashes exactly once per line (the first line used to be
        # escaped twice)
        ll = ll.replace("#", "\\#")
        try:
            vec = fixLine(ll, sep=parent.sep1)
            if len(vec) != ncol:
                raise fileFormatError(
                    "Number of columns doesn't match the header :" +
                    str(parent.lineNum) + ll)

            un = mask({}, parent.mm.idpatterns)
            for i, column in enumerate(vec):
                idvec = column.strip().split(parent.sep2)
                if not idvec[0] or re.match(parent.na, idvec[0]):
                    continue
                for ide in idvec:
                    ide = ide.strip()
                    if re.match(parent.na, ide):
                        continue
                    try:
                        un.append(parent.tables[i], ide, self.confids[i],
                                  parent.sourceid)
                    except idMisMatchError:
                        print("#OFFENDING LINE " +
                              str(parent.lineNum) + "@" +
                              parent.tables[i] +
                              " : " + str(ide))

            # no empty masks
            if not un.isEmpty():
                parent.setMask(un)
        except KeyboardInterrupt:
            raise Exception("Interrupt caught, breaking")
        except fileFormatError:
            print("#ERROR: format problem" + ll)
        except Exception:
            print("#ERROR:" + ll)
        ll = parent.getLine()
def GetCompoundInfo(self, mm, csid):
    """Take a csid, get a mask.

    The URL is built from ``self.GetCompoundInfoURL``, the url-safe csid
    and ``self.token`` and fetched with ``self.parent.getUrl``. The reply
    is parsed as XML and the text of all ``InChI``, ``SMILES`` and
    ``InChIKey`` elements is added to the mask next to the csid itself.

    Parameters:
    -`mm`: a metmask database
    -`csid`: a chemspider identifier

    Returns a mask; it is empty when the URL could not be fetched.
    """
    parent = self.parent
    tmpmask = mask({}, mm.idpatterns)
    csid = parent.urlSafe(csid)
    url = (self.GetCompoundInfoURL + "CSID=" + str(csid) +
           "&token=" + self.token)
    qRes = parent.getUrl(url)
    if not qRes:
        return tmpmask

    tmpmask.append('chemspider', csid, parent.confid, parent.sourceid)
    searchResults = xml.dom.minidom.parse(qRes)

    # XML tag -> table that receives the text of those elements
    tag_tables = (('InChI', 'inchi'),
                  ('SMILES', 'smiles'),
                  ('InChIKey', 'inchikey'))
    for tag, table in tag_tables:
        elements = searchResults.getElementsByTagName(tag)
        for ide in mmquery.nodecontents(elements):
            tmpmask.append(table, ide, parent.confid, parent.sourceid)
    return tmpmask
def process(self):
    """Parse a cyc dump file and fetch cas, kegg, synonym and cycpath.

    Every line is split with ``fixLine(..., sep=parent.sep1)``; lines with
    fewer than six columns and lines identical (after column selection) to
    the previous one are skipped. A mask is stored with ``parent.setMask``
    only when it has more than one identifier and at least one 'cas' or
    'kegg' identifier.
    """
    parent = self.parent
    last = ""
    parent.mm.setTableWeak('cycpath')
    parent.mm.setTableWeak('formula')

    # identifiers extracted from the links column: (table, pattern)
    link_patterns = (
        ('cas', r'CAS:(\d{1,7}-\d{2}-\d{1})'),
        ('kegg', r'LIGAND-CPD:([A-Z][0-9]{5})'),
        ('pubchem', r'PUBCHEM:([0-9]+)'),
        ('knapsack', r'KNAPSACK:([0-9]+)'),
    )

    line = parent.getLine()
    while line:
        line = line.replace("'", "\\'")  # cyc likes apostrophes
        line = line.replace("#", "\\#")  # hashes in the middle of lines
        vec = fixLine(line, sep=parent.sep1)
        if len(vec) < 6:
            line = parent.getLine()
            continue

        # the pathway is only taken from nine-column lines
        pathway = vec[8].strip() if len(vec) == 9 else ''
        vec = [vec[0], vec[1], vec[5], pathway, vec[3], vec[4]]
        if vec == last:
            line = parent.getLine()
            continue
        name, synonyms, links, pathway, formula, smiles = vec

        un = mask({}, parent.mm.idpatterns)
        # pointless if we don't have links: cas and kegg only come from
        # there and a mask without either is never stored
        if links != '':
            weak = parent.mm.confidence['weak']
            for table, pattern in link_patterns:
                for ide in re.findall(pattern, links):
                    un.append(table, ide, parent.confid, parent.sourceid)
            if formula != '':
                un.append('formula', formula.replace(" ", ""), weak,
                          parent.sourceid)
            if smiles != '':
                un.append('smiles', smiles, weak, parent.sourceid)
            un.append('synonym', name, weak, parent.sourceid)
            if synonyms != '':
                for syn in synonyms.split("*"):
                    un.append('synonym', syn, weak, parent.sourceid)
            if pathway != '':
                un.append('cycpath', pathway, weak, parent.sourceid)

        if un.nid() > 1 and (un.hasTable('cas') or un.hasTable('kegg')):
            parent.setMask(un)

        last = vec
        line = parent.getLine()
def pubchem2mask(self, docSum):
    """Turn a docsum node into a mask.

    The text of the first ``Id`` element is stored as 'cid'. Every ``Item``
    element with at least one child node is then handled by its ``Name``
    attribute: the entries of ``SynonymList`` are cleaned from link markup
    and stored in the first of 'kegg', 'cas', 'chebi', 'inchi' that
    ``guessTable`` reports for them, or as a weak 'synonym' otherwise; for
    the other known names the value of the first child node is stored in
    the matching table. Items with any other name are ignored.
    """
    parent = self.parent
    un = mask({}, parent.mm.idpatterns)
    cid = next(mmquery.nodecontents(docSum.getElementsByTagName("Id")))
    un.append('cid', cid, parent.confid, parent.sourceid)

    markup = re.compile('(<a href[^>]*>)|(</a>)|(ligand)|(,)')
    cnf = parent.confid
    src = parent.sourceid
    weak = parent.mm.confidence['weak']

    # tables tried, in this order, for every synonym
    typed_tables = ('kegg', 'cas', 'chebi', 'inchi')
    # Item name -> (table, confidence)
    item_tables = {
        'IUPACName': ('iupac', cnf),
        'CanonicalSmile': ('smiles', cnf),
        'InChIKey': ('inchikey', cnf),
        # annotation section
        'MolecularFormula': ('formula', weak),
        'MolecularWeight': ('weight', weak),
        'TotalFormalCharge': ('totalcharge', weak),
        'XLogP': ('xlogp', weak),
        'HydrogenBondDonorCount': ('hbonddonor', weak),
        'HydrogenBondAcceptorCount': ('hbondacceptor', weak),
        'HeavyAtomCount': ('heavyatom', weak),
        'TPSA': ('tpsa', weak),
    }

    for item in docSum.getElementsByTagName('Item'):
        if len(item.childNodes) == 0:
            continue
        name = item.getAttribute('Name')
        if name == 'SynonymList':
            for s in list(mmquery.nodecontents(item.childNodes)):
                s = markup.sub('', s.strip())
                guessed = guessTable(s, parent.mm.idpatterns)
                for table in typed_tables:
                    if table in guessed:
                        un.append(table, s, cnf, src)
                        break
                else:
                    un.append('synonym', s, weak, src)
        elif name in item_tables:
            table, confid = item_tables[name]
            un.append(table, item.childNodes[0].nodeValue, confid, src)
    return un
def chebi2mask(self, mm, chebiId):
    """Get a mask containing the info associated with a chebi id.

    ``self.getComplete`` is queried for `chebiId` and the XML reply is
    turned into a mask holding chebi, smiles, synonym, inchi, iupac, kegg,
    cas and formula identifiers (synonyms and formulae with weak
    confidence). An empty mask is returned when the URL cannot be fetched
    or the reply has no ``ns1:return`` element. Elements that are present
    but carry no text are skipped.
    """
    parent = self.parent
    un = mask({}, mm.idpatterns)
    qRes = parent.getUrl(self.getComplete + "chebiId=" + str(chebiId))
    if not qRes:
        return mask({})
    searchResults = xml.dom.minidom.parse(qRes)
    if not searchResults:
        return mask({})

    returns = searchResults.getElementsByTagName('ns1:return')
    if not returns:
        # found non-existent chebiId. Removing it from the database would
        # have been reasonable, but chebi has problems so that some entries
        # exist but can't be fetched: a valid chebi that we can't query
        # for. Hence, skip completely.
        return mask({})
    retList = returns[0]

    def first_text(node, tag):
        # text of the first `tag` element below `node`, None if there is
        # no such element or it has no text
        elements = node.getElementsByTagName(tag)
        if not elements:
            return None
        contents = list(query.nodecontents(elements))
        return contents[0] if contents else None

    def section_data(tag):
        # texts of all ns1:data elements below the first `tag` element
        sections = searchResults.getElementsByTagName(tag)
        if not sections:
            return []
        return list(query.nodecontents(
            sections[0].getElementsByTagName("ns1:data")))

    def typed_data(tag, wanted):
        # texts of the ns1:data elements of every `tag` element whose
        # ns1:type is `wanted`
        for entry in searchResults.getElementsByTagName(tag):
            if first_text(entry, "ns1:type") == wanted:
                for data in list(query.nodecontents(
                        entry.getElementsByTagName("ns1:data"))):
                    yield data

    # chebiid
    found = first_text(retList, "ns1:chebiId")
    if found is not None:
        un.append('chebi', found.replace("CHEBI:", ""), parent.confid,
                  parent.sourceid)
    # smiles
    found = first_text(retList, "ns1:smiles")
    if found is not None:
        un.append('smiles', found, parent.confid, parent.sourceid)
    # synonym
    for sy in section_data('ns1:Synonyms'):
        un.append('synonym', sy, parent.mm.confidence['weak'],
                  parent.sourceid)
    # inchi
    found = first_text(retList, "ns1:inchi")
    if found is not None:
        un.append('inchi', found, parent.confid, parent.sourceid)
    # iupac
    for sy in section_data('ns1:IupacNames'):
        un.append('iupac', sy, parent.confid, parent.sourceid)
    # kegg
    for sy in typed_data('ns1:DatabaseLinks', 'KEGG COMPOUND accession'):
        un.append('kegg', sy, parent.confid, parent.sourceid)
    # cas
    for sy in typed_data('ns1:RegistryNumbers', 'CAS Registry Number'):
        un.append('cas', sy, parent.confid, parent.sourceid)
    # formula
    for sy in section_data('ns1:Formulae'):
        un.append('formula', sy, parent.mm.confidence['weak'],
                  parent.sourceid)
    return un
def getChebiMasks(self, un, mm):
    """Collect the chebi masks related to the identifiers of a mask.

    1. The chebi identifiers of `un` are resolved with ``chebi2mask``.
    2. ``self.getLite`` is queried for the iupac, cas, kegg, inchi and
       smiles identifiers of `un` that were not already returned by an
       earlier lookup; every chebi id found that way and not yet known is
       resolved as well.
    3. All children (``self.getChebiChildren``) of the chebi ids found so
       far are added.

    Returns a dict mapping chebi id to mask. It is empty when none of the
    tables of `un` is listed in ``self.queryTables``.
    """
    res = {}
    tmpmask = mask({}, mm.idpatterns)
    if not any(x in self.queryTables for x in un.getTables()):
        return res

    if un.hasTable('chebi'):
        for ch in un.getIdentifiers('chebi'):
            res[ch] = self.chebi2mask(mm, ch)
            tmpmask.merge(res[ch])

    def filteredids(table):
        # identifiers of `un` in `table` that no earlier lookup returned
        identifiers = []
        if un.hasTable(table):
            identifiers = un.getIdentifiers(table)
            if tmpmask.hasTable(table):
                # to avoid querying for the same identifiers several times
                known = tmpmask.getIdentifiers(table)
                identifiers = [x for x in identifiers if x not in known]
        return identifiers

    # (table, chebi search category), queried in this order; tmpmask grows
    # between categories so each filter sees the earlier results
    categories = (
        ('iupac', 'IUPAC+NAME'),
        ('cas', 'REGISTRY+NUMBER'),
        ('kegg', 'DATABASE+LINK'),
        ('inchi', 'INCHI'),
        ('smiles', 'SMILES'),
    )
    for table, category in categories:
        for ide in filteredids(table):
            qUrl = self.getLite + "search=" + \
                self.parent.urlSafe(ide) + "&searchCategory=" + category
            for ch in self.url2ids(qUrl):
                if ch not in res:
                    res[ch] = self.chebi2mask(mm, ch)
                    tmpmask.merge(res[ch])

    # snapshot of the keys: res grows while the children are added
    for ch in list(res.keys()):
        for child in self.getChebiChildren(ch):
            if child not in res:
                res[child] = self.chebi2mask(mm, child)
    return res
def getChebiMasks(self, un, mm):
    """Collect the chebi masks related to the identifiers of a mask.

    1. The chebi identifiers of `un` are resolved with ``chebi2mask``.
    2. ``self.getLite`` is queried for the iupac, cas, kegg, inchi and
       smiles identifiers of `un` that were not already returned by an
       earlier lookup; every chebi id found that way and not yet known is
       resolved as well.
    3. All children (``self.getChebiChildren``) of the chebi ids found so
       far are added.

    Returns a dict mapping chebi id to mask. It is empty when none of the
    tables of `un` is listed in ``self.queryTables``.
    """
    res = {}
    tmpmask = mask({}, mm.idpatterns)
    if not any(x in self.queryTables for x in un.getTables()):
        return res

    if un.hasTable('chebi'):
        for ch in un.getIdentifiers('chebi'):
            res[ch] = self.chebi2mask(mm, ch)
            tmpmask.merge(res[ch])

    def filteredids(table):
        # identifiers of `un` in `table` that no earlier lookup returned
        identifiers = []
        if un.hasTable(table):
            identifiers = un.getIdentifiers(table)
            if tmpmask.hasTable(table):
                # to avoid querying for the same identifiers several times
                known = tmpmask.getIdentifiers(table)
                identifiers = [x for x in identifiers if x not in known]
        return identifiers

    # (table, chebi search category), queried in this order; tmpmask grows
    # between categories so each filter sees the earlier results
    categories = (
        ('iupac', 'IUPAC+NAME'),
        ('cas', 'REGISTRY+NUMBER'),
        ('kegg', 'DATABASE+LINK'),
        ('inchi', 'INCHI'),
        ('smiles', 'SMILES'),
    )
    for table, category in categories:
        for ide in filteredids(table):
            qUrl = self.getLite + "search=" + \
                self.parent.urlSafe(ide) + "&searchCategory=" + category
            for ch in self.url2ids(qUrl):
                if ch not in res:
                    res[ch] = self.chebi2mask(mm, ch)
                    tmpmask.merge(res[ch])

    # snapshot of the keys: res grows while the children are added
    for ch in list(res.keys()):
        for child in self.getChebiChildren(ch):
            if child not in res:
                res[child] = self.chebi2mask(mm, child)
    return res
def process(self):
    """Parse a cyc dump file and fetch cas, kegg, synonym and cycpath.

    Every line is split with ``fixLine(..., sep=parent.sep1)``; lines with
    fewer than six columns and lines identical (after column selection) to
    the previous one are skipped. A mask is stored with ``parent.setMask``
    only when it has more than one identifier and at least one 'cas' or
    'kegg' identifier.
    """
    parent = self.parent
    last = ""
    parent.mm.setTableWeak('cycpath')
    parent.mm.setTableWeak('formula')

    # identifiers extracted from the links column: (table, pattern)
    link_patterns = (
        ('cas', r'CAS:(\d{1,7}-\d{2}-\d{1})'),
        ('kegg', r'LIGAND-CPD:([A-Z][0-9]{5})'),
        ('pubchem', r'PUBCHEM:([0-9]+)'),
        ('knapsack', r'KNAPSACK:([0-9]+)'),
    )

    line = parent.getLine()
    while line:
        line = line.replace("'", "\\'")  # cyc likes apostrophes
        line = line.replace("#", "\\#")  # hashes in the middle of lines
        vec = fixLine(line, sep=parent.sep1)
        if len(vec) < 6:
            line = parent.getLine()
            continue

        # the pathway is only taken from nine-column lines
        pathway = vec[8].strip() if len(vec) == 9 else ''
        vec = [vec[0], vec[1], vec[5], pathway, vec[3], vec[4]]
        if vec == last:
            line = parent.getLine()
            continue
        name, synonyms, links, pathway, formula, smiles = vec

        un = mask({}, parent.mm.idpatterns)
        # pointless if we don't have links: cas and kegg only come from
        # there and a mask without either is never stored
        if links != '':
            weak = parent.mm.confidence['weak']
            for table, pattern in link_patterns:
                for ide in re.findall(pattern, links):
                    un.append(table, ide, parent.confid, parent.sourceid)
            if formula != '':
                un.append('formula', formula.replace(" ", ""), weak,
                          parent.sourceid)
            if smiles != '':
                un.append('smiles', smiles, weak, parent.sourceid)
            un.append('synonym', name, weak, parent.sourceid)
            if synonyms != '':
                for syn in synonyms.split("*"):
                    un.append('synonym', syn, weak, parent.sourceid)
            if pathway != '':
                un.append('cycpath', pathway, weak, parent.sourceid)

        if un.nid() > 1 and (un.hasTable('cas') or un.hasTable('kegg')):
            parent.setMask(un)

        last = vec
        line = parent.getLine()
def chebi2mask(self, mm, chebiId):
    """Get a mask containing the info associated with a chebi id.

    ``self.getComplete`` is queried for `chebiId` and the XML reply is
    turned into a mask holding chebi, smiles, synonym, inchi, iupac, kegg,
    cas and formula identifiers (synonyms and formulae with weak
    confidence). An empty mask is returned when the URL cannot be fetched
    or the reply has no ``ns1:return`` element. Elements that are present
    but carry no text are skipped.
    """
    parent = self.parent
    un = mask({}, mm.idpatterns)
    qRes = parent.getUrl(self.getComplete + "chebiId=" + str(chebiId))
    if not qRes:
        return mask({})
    searchResults = xml.dom.minidom.parse(qRes)
    if not searchResults:
        return mask({})

    returns = searchResults.getElementsByTagName('ns1:return')
    if not returns:
        # found non-existent chebiId. Removing it from the database would
        # have been reasonable, but chebi has problems so that some entries
        # exist but can't be fetched: a valid chebi that we can't query
        # for. Hence, skip completely.
        return mask({})
    retList = returns[0]

    def first_text(node, tag):
        # text of the first `tag` element below `node`, None if there is
        # no such element or it has no text
        elements = node.getElementsByTagName(tag)
        if not elements:
            return None
        contents = list(query.nodecontents(elements))
        return contents[0] if contents else None

    def section_data(tag):
        # texts of all ns1:data elements below the first `tag` element
        sections = searchResults.getElementsByTagName(tag)
        if not sections:
            return []
        return list(query.nodecontents(
            sections[0].getElementsByTagName("ns1:data")))

    def typed_data(tag, wanted):
        # texts of the ns1:data elements of every `tag` element whose
        # ns1:type is `wanted`
        for entry in searchResults.getElementsByTagName(tag):
            if first_text(entry, "ns1:type") == wanted:
                for data in list(query.nodecontents(
                        entry.getElementsByTagName("ns1:data"))):
                    yield data

    # chebiid
    found = first_text(retList, "ns1:chebiId")
    if found is not None:
        un.append('chebi', found.replace("CHEBI:", ""), parent.confid,
                  parent.sourceid)
    # smiles
    found = first_text(retList, "ns1:smiles")
    if found is not None:
        un.append('smiles', found, parent.confid, parent.sourceid)
    # synonym
    for sy in section_data('ns1:Synonyms'):
        un.append('synonym', sy, parent.mm.confidence['weak'],
                  parent.sourceid)
    # inchi
    found = first_text(retList, "ns1:inchi")
    if found is not None:
        un.append('inchi', found, parent.confid, parent.sourceid)
    # iupac
    for sy in section_data('ns1:IupacNames'):
        un.append('iupac', sy, parent.confid, parent.sourceid)
    # kegg
    for sy in typed_data('ns1:DatabaseLinks', 'KEGG COMPOUND accession'):
        un.append('kegg', sy, parent.confid, parent.sourceid)
    # cas
    for sy in typed_data('ns1:RegistryNumbers', 'CAS Registry Number'):
        un.append('cas', sy, parent.confid, parent.sourceid)
    # formula
    for sy in section_data('ns1:Formulae'):
        un.append('formula', sy, parent.mm.confidence['weak'],
                  parent.sourceid)
    return un
def process(self):
    """Parse a KEGG COMPOUNDS flat file.

    The first line must start with ``ENTRY``, otherwise a
    ``fileFormatError`` is raised. Every ``ENTRY`` line opens a new record;
    the mask gathered for the previous record is stored with
    ``parent.setMask`` when it is not empty and the record was not flagged
    as a KEGG-internal generic (peptide entries and certain COMMENT lines).
    Within a record kegg, synonym, formula, knapsack, cas, sid, chebi and
    pathway identifiers are collected.
    """
    parent = self.parent
    # annotate these tables as fully weak
    parent.mm.setTableWeak('pathway')
    parent.mm.setTableWeak('formula')

    ll = parent.getLine()
    if not re.match("^ENTRY", ll):
        raise fileFormatError("COMPOUNDS file doesn't look like expected")

    un = mask({}, parent.mm.idpatterns)
    good = True
    while ll:
        # kegg: a new entry, send off the gathered data
        x = re.findall(r'ENTRY *([A-Z][0-9]{5}) ', ll)
        if x:
            if not un.isEmpty() and good:
                parent.setMask(un)
            un = mask({}, parent.mm.idpatterns)  # new mask
            un.append('kegg', x[0].strip(), parent.confid, parent.sourceid)
            good = True

        # synonym; names continue on the following lines for as long as
        # the current line ends with ';'
        x = re.findall(r'NAME +(.+)', ll)
        if x:
            un.append('synonym', x[0].strip().replace(";", ""),
                      parent.confid, parent.sourceid)
            while re.match(".+;$", ll):
                ll = parent.getLine()
                un.append('synonym', ll.strip().replace(";", ""),
                          parent.confid, parent.sourceid)

        # formula
        x = re.findall(r'FORMULA +(.+)', ll)
        if x:
            un.append('formula', x[0], parent.mm.confidence['weak'],
                      parent.sourceid)
        # knapsack
        x = re.findall(r'KNApSAcK: (.+)', ll)
        if x:
            un.append('knapsack', x[0], parent.confid, parent.sourceid)
        # cas
        if 'CAS:' in ll:
            for casno in re.findall(r'(\d{2,7}-\d{2}-\d{1})', ll):
                un.append('cas', casno, parent.confid, parent.sourceid)
        # sid <- obs not cid!
        x = re.findall(r'PubChem: (\d+)', ll)
        if x:
            un.append('sid', x[0], parent.confid, parent.sourceid)
        # chebi
        x = re.findall(r'ChEBI: (\d+)', ll)
        if x:
            un.append('chebi', x[0], parent.confid, parent.sourceid)
        # pathway
        x = re.findall(r'(PATHWAY)*\s*(ko|map)(\d+)', ll)
        if x:
            un.append('pathway', x[0][2], parent.mm.confidence['weak'],
                      parent.sourceid)

        # get rid of kegg internal generics.
        # NOTE(review): the pattern used to carry a '$' right after the
        # identifier, so it could never match; with it removed, peptide
        # entries are skipped as the comment intends - confirm this is
        # wanted before re-importing.
        if re.match(r'ENTRY *[A-Z][0-9]{5} *Peptide *Compound$', ll):
            good = False
        if re.match('COMMENT', ll):
            if re.match('COMMENT generic compound in reaction hierarchy',
                        ll) or \
               re.match('COMMENT coordination compound', ll) or \
               re.match('COMMENT.+R=.+', ll):
                good = False

        ll = parent.getLine()

    # send off the last mask if we have one
    if not un.isEmpty() and good:
        parent.setMask(un)