def test1SDSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'NCI_aids.10.sdf') suppl = Chem.SDMolSupplier(fileN) ms = [x for x in suppl] self.assertEqual(len(ms), 10) # test repeating: ms = [x for x in suppl] self.assertEqual(len(ms), 10) # test reset: suppl.reset() m = next(suppl) self.assertEqual(m.GetProp('_Name'), '48') self.assertEqual(m.GetProp('NSC'), '48') self.assertEqual(m.GetProp('CAS_RN'), '15716-70-8') m = next(suppl) self.assertEqual(m.GetProp('_Name'), '78') self.assertEqual(m.GetProp('NSC'), '78') self.assertEqual(m.GetProp('CAS_RN'), '6290-84-2') suppl.reset() for _ in range(10): m = next(suppl) with self.assertRaises(StopIteration): m = next(suppl)
def test2SmilesSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'pgp_20.txt') suppl = Chem.SmilesMolSupplier(fileN, delimiter='\t', smilesColumn=2, nameColumn=1, titleLine=1) ms = [x for x in suppl] assert len(ms) == 20 # test repeating: ms = [x for x in suppl] assert len(ms) == 20 # test reset: suppl.reset() m = next(suppl) assert m.GetProp('_Name') == 'ALDOSTERONE' assert m.GetProp('ID') == 'RD-PGP-0001' m = next(suppl) assert m.GetProp('_Name') == 'AMIODARONE' assert m.GetProp('ID') == 'RD-PGP-0002' suppl.reset() for i in range(20): m = next(suppl) try: m = next(suppl) except StopIteration: ok = 1 else: ok = 0 assert ok
def SupplierFromDetails(details): from rdkit.VLib.NodeLib.DbMolSupply import DbMolSupplyNode from rdkit.VLib.NodeLib.SmilesSupply import SmilesSupplyNode if details.dbName: conn = DbConnect(details.dbName, details.tableName) suppl = DbMolSupplyNode(conn.GetData()) else: suppl = SmilesSupplyNode(details.inFileName, delim=details.delim, nameColumn=details.nameCol, smilesColumn=details.smiCol, titleLine=details.hasTitle) if isinstance(details.actCol, int): suppl.reset() m = next(suppl) actName = m.GetPropNames()[details.actCol] details.actCol = actName if isinstance(details.nameCol, int): suppl.reset() m = next(suppl) nameName = m.GetPropNames()[details.nameCol] details.nameCol = nameName suppl.reset() if isinstance(details.actCol, int): suppl.reset() m = next(suppl) actName = m.GetPropNames()[details.actCol] details.actCol = actName if isinstance(details.nameCol, int): suppl.reset() m = next(suppl) nameName = m.GetPropNames()[details.nameCol] details.nameCol = nameName suppl.reset() return suppl
def test2SmilesSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir,'VLib','NodeLib','test_data','pgp_20.txt') suppl = Chem.SmilesMolSupplier(fileN,delimiter='\t',smilesColumn=2, nameColumn=1,titleLine=1) ms = [x for x in suppl] assert len(ms)==20 # test repeating: ms = [x for x in suppl] assert len(ms)==20 # test reset: suppl.reset() m = next(suppl) assert m.GetProp('_Name')=='ALDOSTERONE' assert m.GetProp('ID')=='RD-PGP-0001' m = next(suppl) assert m.GetProp('_Name')=='AMIODARONE' assert m.GetProp('ID')=='RD-PGP-0002' suppl.reset() for i in range(20): m = next(suppl) try: m = next(suppl) except StopIteration: ok=1 else: ok=0 assert ok
def test1SDSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir,'VLib','NodeLib','test_data','NCI_aids.10.sdf') suppl = Chem.SDMolSupplier(fileN) ms = [x for x in suppl] assert len(ms)==10 # test repeating: ms = [x for x in suppl] assert len(ms)==10 # test reset: suppl.reset() m = next(suppl) assert m.GetProp('_Name')=='48' assert m.GetProp('NSC')=='48' assert m.GetProp('CAS_RN')=='15716-70-8' m = next(suppl) assert m.GetProp('_Name')=='78' assert m.GetProp('NSC')=='78' assert m.GetProp('CAS_RN')=='6290-84-2' suppl.reset() for i in range(10): m = next(suppl) try: m = next(suppl) except StopIteration: ok=1 else: ok=0 assert ok
def test1SDSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'NCI_aids.10.sdf') suppl = Chem.SDMolSupplier(fileN) ms = [x for x in suppl] assert len(ms) == 10 # test repeating: ms = [x for x in suppl] assert len(ms) == 10 # test reset: suppl.reset() m = next(suppl) assert m.GetProp('_Name') == '48' assert m.GetProp('NSC') == '48' assert m.GetProp('CAS_RN') == '15716-70-8' m = next(suppl) assert m.GetProp('_Name') == '78' assert m.GetProp('NSC') == '78' assert m.GetProp('CAS_RN') == '6290-84-2' suppl.reset() for i in range(10): m = next(suppl) try: m = next(suppl) except StopIteration: ok = 1 else: ok = 0 assert ok
def testOrderBug(self): sdFile = os.path.join(RDConfig.RDCodeDir, 'Chem', 'Pharm2D', 'test_data', 'orderBug.sdf') suppl = Chem.SDMolSupplier(sdFile) m1 = next(suppl) m2 = next(suppl) sig1 = Generate.Gen2DFingerprint(m1, self.factory) sig2 = Generate.Gen2DFingerprint(m2, self.factory) self.assertEqual(sig1, sig2)
def test_SmilesReaderIndex(self): # tests lazy reads supp = Chem.SmilesMolSupplierFromText('\n'.join(self.smis), ',', 0, -1, 0) for i in range(4): self.assertMolecule(next(supp), i) i = len(supp) - 1 self.assertMolecule(supp[i], i) # Use in a list comprehension ms = [Chem.MolToSmiles(mol) for mol in supp] self.assertEqual(ms, self.smis) self.assertEqual(len(supp), self.nMolecules, 'bad supplier length') # Despite iterating through the whole supplier, we can still access by index i = self.nMolecules - 3 self.assertMolecule(supp[i - 1], i, msg='back index: ') with self.assertRaises(IndexError): _ = supp[self.nMolecules] # out of bound read must fail # and we can access with negative numbers mol1 = supp[len(supp) - 1] mol2 = supp[-1] self.assertEqual(Chem.MolToSmiles(mol1), Chem.MolToSmiles(mol2))
def test1LazyReader(self): " tests lazy reads """ supp = Chem.SmilesMolSupplierFromText('\n'.join(self.smis),',',0,-1,0) for i in range(4): m = next(supp) assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i i = len(supp)-1 m = supp[i] assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i ms = [x for x in supp] for i in range(len(supp)): m = ms[i] if m: ms[i] = Chem.MolToSmiles(m) l = len(supp) assert l == len(self.smis),'bad supplier length: %d'%(l) i = len(self.smis)-3 m = supp[i-1] assert m,'back index %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i try: m = supp[len(self.smis)] except: fail = 1 else: fail = 0 assert fail,'out of bound read did not fail'
def _testStreamRoundtrip(self): inD = open(self.fName).read() supp = Chem.SDMolSupplier(self.fName) outName = tempfile.mktemp('.sdf') writer = Chem.SDWriter(outName) m1 = next(supp) for m in supp: writer.write(m) writer.flush() writer = None outD = open(outName,'r').read() try: os.unlink(outName) except Exception: import time time.sleep(1) try: os.unlink(outName) except Exception: pass assert inD.count('$$$$')==outD.count('$$$$'),'bad nMols in output' io = StringIO(outD) supp = Chem.SDMolSupplier(stream=io) outD2 = supp.Dump() assert outD2.count('$$$$')==len(supp),'bad nMols in output' assert outD2.count('$$$$')==outD.count('$$$$'),'bad nMols in output' assert outD2==outD,'bad outd'
def _testStreamRoundtrip(self): inD = open(self.fName).read() supp = Chem.SDMolSupplier(self.fName) outName = tempfile.mktemp('.sdf') writer = Chem.SDWriter(outName) m1 = next(supp) for m in supp: writer.write(m) writer.flush() writer = None outD = open(outName,'r').read() try: os.unlink(outName) except: import time time.sleep(1) try: os.unlink(outName) except: pass assert inD.count('$$$$')==outD.count('$$$$'),'bad nMols in output' io = StringIO(outD) supp = Chem.SDMolSupplier(stream=io) outD2 = supp.Dump() assert outD2.count('$$$$')==len(supp),'bad nMols in output' assert outD2.count('$$$$')==outD.count('$$$$'),'bad nMols in output' assert outD2==outD,'bad outd'
def _testReader(self): " tests reads using a file name " supp = Chem.SDMolSupplier(self.fName) for i in range(10): m = next(supp) assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i i = 100 m = supp[i-1] assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i l = len(supp) assert l == 200,'bad supplier length: %d'%(l) i = 12 m = supp[i-1] assert m,'back index %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i try: m = supp[201] except IndexError: fail = 1 else: fail = 0 assert fail,'out of bound read did not fail'
def test_Writer(self): " tests writes using a file name " with open(self.fName,'r') as inf: inD = inf.read() supp = Chem.SDMolSupplier(self.fName) outName = tempfile.mktemp('.sdf') writer = Chem.SDWriter(outName) m1 = next(supp) writer.SetProps(m1.GetPropNames()) for m in supp: writer.write(m) writer.flush() writer = None with open(outName,'r') as inf: outD = inf.read() try: os.unlink(outName) except: import time time.sleep(1) try: os.unlink(outName) except: pass assert inD.count('$$$$')==outD.count('$$$$'),'bad nMols in output'
def test_SDMolSupplier(self): # tests reads using a file name (file contains 200 molecules) supp = Chem.SDMolSupplier(self.fName) # Can use as an iterator for i in range(10): mol = next(supp) self.assertMolecule(mol, i) # Can access directly i = 100 mol = supp[i - 1] self.assertMolecule(mol, i) # We can access the number of molecules self.assertEqual(len(supp), self.nMolecules, 'bad supplier length') # We know the number and can still access directly i = 12 mol = supp[i - 1] self.assertMolecule(mol, i) # Get an exception if we access an invalid number with self.assertRaises(IndexError): _ = supp[self.nMolecules] # out of bound read must fail # and we can access with negative numbers mol1 = supp[len(supp) - 1] mol2 = supp[-1] self.assertEqual(Chem.MolToSmiles(mol1), Chem.MolToSmiles(mol2))
def test1LazyReader(self): " tests lazy reads """ supp = Chem.SmilesMolSupplierFromText('\n'.join(self.smis),',',0,-1,0) for i in range(4): m = next(supp) assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i i = len(supp)-1 m = supp[i] assert m,'read %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i ms = [x for x in supp] for i in range(len(supp)): m = ms[i] if m: ms[i] = Chem.MolToSmiles(m) l = len(supp) assert l == len(self.smis),'bad supplier length: %d'%(l) i = len(self.smis)-3 m = supp[i-1] assert m,'back index %d failed'%i assert m.GetNumAtoms(),'no atoms in mol %d'%i with self.assertRaisesRegexp(Exception, ""): m = supp[len(self.smis)] # out of bound read must fail
def test_Writer(self): " tests writes using a file name " with open(self.fName,'r') as inf: inD = inf.read() supp = Chem.SDMolSupplier(self.fName) outName = tempfile.mktemp('.sdf') writer = Chem.SDWriter(outName) m1 = next(supp) writer.SetProps(m1.GetPropNames()) for m in supp: writer.write(m) writer.flush() # The writer does not have an explicit "close()" so need to # let the garbage collector kick in to close the file. writer = None with open(outName,'r') as inf: outD = inf.read() # The file should be closed, but if it isn't, and this # is Windows, then the unlink() can fail. Wait and try again. try: os.unlink(outName) except Exception: import time time.sleep(1) try: os.unlink(outName) except Exception: pass self.assertEqual(inD.count('$$$$'),outD.count('$$$$'),'bad nMols in output')
def test_SDWriter(self): # tests writes using a file name supp = Chem.SDMolSupplier(self.fName) _, outName = tempfile.mkstemp('.sdf') writer = Chem.SDWriter(outName) m1 = next(supp) writer.SetProps(m1.GetPropNames()) for m in supp: writer.write(m) writer.flush() writer.close() # The writer does not have an explicit "close()" so need to # let the garbage collector kick in to close the file. writer = None with open(outName, 'r') as inf: outD = inf.read() # The file should be closed, but if it isn't, and this # is Windows, then the unlink() can fail. Wait and try again. try: os.unlink(outName) except Exception: import time time.sleep(1) try: os.unlink(outName) except Exception: pass self.assertEqual(self.nMolecules, outD.count('$$$$'), 'bad nMols in output')
def ScoreFromLists(bitLists, suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, reportFreq=10): """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists for the molecules (this speeds things up a lot) **Arguments** - bitLists: sequence of on bit sequences for the input molecules - suppl: the input supplier (we read activities from here) - catalog: the FragmentCatalog - maxPts: (optional) the maximum number of molecules to be considered - actName: (optional) the name of the molecule's activity property. If this is not provided, the molecule's last property will be used. - nActs: (optional) number of possible activity values - reportFreq: (optional) how often to display status information **Returns** the results table (a 3D array of ints nBits x 2 x nActs) """ nBits = catalog.GetFPLength() if maxPts > 0: nPts = maxPts else: nPts = len(bitLists) resTbl = numpy.zeros((nBits, 2, nActs), numpy.int) if not actName and not acts: actName = suppl[0].GetPropNames()[-1] suppl.reset() for i in range(1, nPts + 1): mol = next(suppl) if not acts: act = int(mol.GetProp(actName)) else: act = acts[i - 1] if i and not i % reportFreq: message('Done %d of %d\n' % (i, nPts)) ids = set() for id_ in bitLists[i - 1]: ids.add(id_ - 1) for j in range(nBits): resTbl[j, 0, act] += 1 for id_ in ids: resTbl[id_, 0, act] -= 1 resTbl[id_, 1, act] += 1 return resTbl
def test2SmilesSupplier(self): fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'pgp_20.txt') suppl = Chem.SmilesMolSupplier(fileN, delimiter='\t', smilesColumn=2, nameColumn=1, titleLine=1) ms = [x for x in suppl] self.assertEqual(len(ms), 20) # test repeating: ms = [x for x in suppl] self.assertEqual(len(ms), 20) # test reset: suppl.reset() m = next(suppl) self.assertEqual(m.GetProp('_Name'), 'ALDOSTERONE') self.assertEqual(m.GetProp('ID'), 'RD-PGP-0001') m = next(suppl) self.assertEqual(m.GetProp('_Name'), 'AMIODARONE') self.assertEqual(m.GetProp('ID'), 'RD-PGP-0002') suppl.reset() for _ in range(20): m = next(suppl) with self.assertRaises(StopIteration): m = next(suppl)
def test7ConstrainedEmbedding(self): ofile = os.path.join(RDConfig.RDBaseDir, "Code", "GraphMol", "DistGeomHelpers", "test_data", "constrain1.sdf") suppl = Chem.SDMolSupplier(ofile) ref = next(suppl) probe = copy.deepcopy(ref) cMap = {} for i in range(5): cMap[i] = ref.GetConformer().GetAtomPosition(i) ci = rdDistGeom.EmbedMolecule(probe, coordMap=cMap, randomSeed=23) self.assertTrue(ci > -1) algMap = list(zip(range(5), range(5))) ssd = rdMolAlign.AlignMol(probe, ref, atomMap=algMap) self.assertTrue(ssd < 0.1)
def test7ConstrainedEmbedding(self): ofile = os.path.join(RDConfig.RDBaseDir, 'Code', 'GraphMol', 'DistGeomHelpers', 'test_data', 'constrain1.sdf') suppl = Chem.SDMolSupplier(ofile) ref = next(suppl) probe = copy.deepcopy(ref) cMap = {} for i in range(5): cMap[i] = ref.GetConformer().GetAtomPosition(i) ci = rdDistGeom.EmbedMolecule(probe, coordMap=cMap, randomSeed=23) self.assertTrue(ci > -1) algMap = list(zip(range(5), range(5))) ssd = rdMolAlign.AlignMol(probe, ref, atomMap=algMap) self.assertTrue(ssd < 0.1)
def _nextMatch(self): """ *Internal use only* """ done = 0 res = None sim = 0 while not done: # this is going to crap out when the data source iterator finishes, # that's how we stop when no match is found obj = six.next(self.dataIter) fp = self.fingerprinter(obj) sim = DataStructs.FingerprintSimilarity(fp, self.probe, self.metric) if sim >= self.threshold: res = obj done = 1 return sim, res
def RecapDecompose(mol, allNodes=None, minFragmentSize=0, onlyUseReactions=None): """ returns the recap decomposition for a molecule """ mSmi = Chem.MolToSmiles(mol, 1) if allNodes is None: allNodes = {} if mSmi in allNodes: return allNodes[mSmi] res = RecapHierarchyNode(mol) res.smiles = mSmi activePool = {mSmi: res} allNodes[mSmi] = res while activePool: nSmi = next(iterkeys(activePool)) node = activePool.pop(nSmi) if not node.mol: continue for rxnIdx, reaction in enumerate(reactions): if onlyUseReactions and rxnIdx not in onlyUseReactions: continue #print ' .',nSmi #print ' !!!!',rxnIdx,nSmi,reactionDefs[rxnIdx] ps = reaction.RunReactants((node.mol, )) #print ' ',len(ps) if ps: for prodSeq in ps: seqOk = True # we want to disqualify small fragments, so sort the product sequence by size # and then look for "forbidden" fragments tSeq = [(prod.GetNumAtoms(onlyExplicit=True), idx) for idx, prod in enumerate(prodSeq)] tSeq.sort() ts = [(x, prodSeq[y]) for x, y in tSeq] prodSeq = ts for nats, prod in prodSeq: try: Chem.SanitizeMol(prod) except: continue pSmi = Chem.MolToSmiles(prod, 1) if minFragmentSize > 0: nDummies = pSmi.count('*') if nats - nDummies < minFragmentSize: seqOk = False break # don't forget after replacing dummy atoms to remove any empty # branches: elif pSmi.replace('[*]', '').replace('()', '') in ('', 'C', 'CC', 'CCC'): seqOk = False break prod.pSmi = pSmi if seqOk: for nats, prod in prodSeq: pSmi = prod.pSmi #print '\t',nats,pSmi if not pSmi in allNodes: pNode = RecapHierarchyNode(prod) pNode.smiles = pSmi pNode.parents[nSmi] = weakref.proxy(node) node.children[pSmi] = pNode activePool[pSmi] = pNode allNodes[pSmi] = pNode else: pNode = allNodes[pSmi] pNode.parents[nSmi] = weakref.proxy(node) node.children[pSmi] = pNode #print ' >>an:',allNodes.keys() return res
def BRICSDecompose(mol,allNodes=None,minFragmentSize=1,onlyUseReactions=None, silent=True,keepNonLeafNodes=False,singlePass=False,returnMols=False): """ returns the BRICS decomposition for a molecule >>> from rdkit import Chem >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m)) >>> sorted(res) ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]'] >>> res = list(BRICSDecompose(m,returnMols=True)) >>> res[0] <rdkit.Chem.rdchem.Mol object ...> >>> smis = [Chem.MolToSmiles(x,True) for x in res] >>> sorted(smis) ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]'] nexavar, an example from the paper (corrected): >>> m = Chem.MolFromSmiles('CNC(=O)C1=NC=CC(OC2=CC=C(NC(=O)NC3=CC(=C(Cl)C=C3)C(F)(F)F)C=C2)=C1') >>> res = list(BRICSDecompose(m)) >>> sorted(res) ['[1*]C([1*])=O', '[1*]C([6*])=O', '[14*]c1cc([16*])ccn1', '[16*]c1ccc(Cl)c([16*])c1', '[16*]c1ccc([16*])cc1', '[3*]O[3*]', '[5*]NC', '[5*]N[5*]', '[8*]C(F)(F)F'] it's also possible to keep pieces that haven't been fully decomposed: >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[3*]O[3*]', '[4*]CC', '[4*]CCC'] >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True)) >>> sorted(res) ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[16*]c1cccc([16*])c1', '[3*]OCCC', '[3*]OC[8*]', '[3*]OCc1cccc(-c2ccccn2)c1', '[3*]OCc1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]', '[4*]Cc1cccc(-c2ccccn2)c1', '[4*]Cc1cccc([16*])c1', '[8*]COCCC'] or to only do a single pass of decomposition: >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m,singlePass=True)) >>> sorted(res) ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[3*]OCCC', '[3*]OCc1cccc(-c2ccccn2)c1', '[4*]CCC', '[4*]Cc1cccc(-c2ccccn2)c1', '[8*]COCCC'] setting a minimum size for the fragments: >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=2)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC'] >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=3)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[4*]CCC'] >>> res = list(BRICSDecompose(m,minFragmentSize=2)) >>> sorted(res) ['[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC'] """ global reactions mSmi = Chem.MolToSmiles(mol,1) if allNodes is None: allNodes=set() if mSmi in allNodes: return set() activePool={mSmi:mol} allNodes.add(mSmi) foundMols={mSmi:mol} for gpIdx,reactionGp in enumerate(reactions): newPool = {} while activePool: matched=False nSmi = next(iterkeys(activePool)) mol = activePool.pop(nSmi) for rxnIdx,reaction in enumerate(reactionGp): if onlyUseReactions and (gpIdx,rxnIdx) not in onlyUseReactions: continue if not silent: print('--------') print(smartsGps[gpIdx][rxnIdx]) ps = reaction.RunReactants((mol,)) if ps: if not silent: print(nSmi,'->',len(ps),'products') for prodSeq in ps: seqOk=True # we want to disqualify small fragments, so sort the product sequence by size tSeq = [(prod.GetNumAtoms(onlyExplicit=True),idx) for idx,prod in enumerate(prodSeq)] tSeq.sort() for nats,idx in tSeq: prod = prodSeq[idx] try: Chem.SanitizeMol(prod) except: continue pSmi = Chem.MolToSmiles(prod,1) if minFragmentSize>0: nDummies = pSmi.count('*') if nats-nDummies<minFragmentSize: seqOk=False break prod.pSmi = pSmi ts = [(x,prodSeq[y]) for x,y in tSeq] prodSeq=ts if seqOk: matched=True for nats,prod in prodSeq: pSmi = prod.pSmi #print('\t',nats,pSmi) if pSmi not in allNodes: if not singlePass: activePool[pSmi] = prod allNodes.add(pSmi) foundMols[pSmi]=prod if singlePass or keepNonLeafNodes or not matched: newPool[nSmi]=mol activePool = newPool if not (singlePass or keepNonLeafNodes): if not returnMols: res = set(activePool.keys()) else: res = activePool.values() else: if not returnMols: res = allNodes else: res = foundMols.values() return res
def RecapDecompose(mol,allNodes=None,minFragmentSize=0,onlyUseReactions=None): """ returns the recap decomposition for a molecule """ mSmi = Chem.MolToSmiles(mol,1) if allNodes is None: allNodes={} if mSmi in allNodes: return allNodes[mSmi] res = RecapHierarchyNode(mol) res.smiles =mSmi activePool={mSmi:res} allNodes[mSmi]=res while activePool: nSmi = next(iterkeys(activePool)) node = activePool.pop(nSmi) if not node.mol: continue for rxnIdx,reaction in enumerate(reactions): if onlyUseReactions and rxnIdx not in onlyUseReactions: continue #print ' .',nSmi #print ' !!!!',rxnIdx,nSmi,reactionDefs[rxnIdx] ps = reaction.RunReactants((node.mol,)) #print ' ',len(ps) if ps: for prodSeq in ps: seqOk=True # we want to disqualify small fragments, so sort the product sequence by size # and then look for "forbidden" fragments tSeq = [(prod.GetNumAtoms(onlyExplicit=True),idx) for idx,prod in enumerate(prodSeq)] tSeq.sort() ts=[(x,prodSeq[y]) for x,y in tSeq] prodSeq=ts for nats,prod in prodSeq: try: Chem.SanitizeMol(prod) except: continue pSmi = Chem.MolToSmiles(prod,1) if minFragmentSize>0: nDummies = pSmi.count('*') if nats-nDummies<minFragmentSize: seqOk=False break # don't forget after replacing dummy atoms to remove any empty # branches: elif pSmi.replace('[*]','').replace('()','') in ('','C','CC','CCC'): seqOk=False break prod.pSmi = pSmi if seqOk: for nats,prod in prodSeq: pSmi = prod.pSmi #print '\t',nats,pSmi if not pSmi in allNodes: pNode = RecapHierarchyNode(prod) pNode.smiles=pSmi pNode.parents[nSmi]=weakref.proxy(node) node.children[pSmi]=pNode activePool[pSmi] = pNode allNodes[pSmi]=pNode else: pNode=allNodes[pSmi] pNode.parents[nSmi]=weakref.proxy(node) node.children[pSmi]=pNode #print ' >>an:',allNodes.keys() return res
def BRICSDecompose(mol, allNodes=None, minFragmentSize=1, onlyUseReactions=None, silent=True, keepNonLeafNodes=False, singlePass=False, returnMols=False): """ returns the BRICS decomposition for a molecule >>> from rdkit import Chem >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m)) >>> sorted(res) ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]'] >>> res = list(BRICSDecompose(m,returnMols=True)) >>> res[0] <rdkit.Chem.rdchem.Mol object ...> >>> smis = [Chem.MolToSmiles(x,True) for x in res] >>> sorted(smis) ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]'] nexavar, an example from the paper (corrected): >>> m = Chem.MolFromSmiles('CNC(=O)C1=NC=CC(OC2=CC=C(NC(=O)NC3=CC(=C(Cl)C=C3)C(F)(F)F)C=C2)=C1') >>> res = list(BRICSDecompose(m)) >>> sorted(res) ['[1*]C([1*])=O', '[1*]C([6*])=O', '[14*]c1cc([16*])ccn1', '[16*]c1ccc(Cl)c([16*])c1', '[16*]c1ccc([16*])cc1', '[3*]O[3*]', '[5*]NC', '[5*]N[5*]', '[8*]C(F)(F)F'] it's also possible to keep pieces that haven't been fully decomposed: >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[3*]O[3*]', '[4*]CC', '[4*]CCC'] >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True)) >>> sorted(res) ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[16*]c1cccc([16*])c1', '[3*]OCCC', '[3*]OC[8*]', '[3*]OCc1cccc(-c2ccccn2)c1', '[3*]OCc1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]', '[4*]Cc1cccc(-c2ccccn2)c1', '[4*]Cc1cccc([16*])c1', '[8*]COCCC'] or to only do a single pass of decomposition: >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1') >>> res = list(BRICSDecompose(m,singlePass=True)) >>> sorted(res) ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[3*]OCCC', '[3*]OCc1cccc(-c2ccccn2)c1', '[4*]CCC', '[4*]Cc1cccc(-c2ccccn2)c1', '[8*]COCCC'] setting a minimum size for the fragments: >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=2)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC'] >>> m = Chem.MolFromSmiles('CCCOCC') >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=3)) >>> sorted(res) ['CCCOCC', '[3*]OCC', '[4*]CCC'] >>> res = list(BRICSDecompose(m,minFragmentSize=2)) >>> sorted(res) ['[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC'] """ global reactions mSmi = Chem.MolToSmiles(mol, 1) if allNodes is None: allNodes = set() if mSmi in allNodes: return set() activePool = {mSmi: mol} allNodes.add(mSmi) foundMols = {mSmi: mol} for gpIdx, reactionGp in enumerate(reactions): newPool = {} while activePool: matched = False nSmi = next(iterkeys(activePool)) mol = activePool.pop(nSmi) for rxnIdx, reaction in enumerate(reactionGp): if onlyUseReactions and (gpIdx, rxnIdx) not in onlyUseReactions: continue if not silent: print('--------') print(smartsGps[gpIdx][rxnIdx]) ps = reaction.RunReactants((mol, )) if ps: if not silent: print(nSmi, '->', len(ps), 'products') for prodSeq in ps: seqOk = True # we want to disqualify small fragments, so sort the product sequence by size tSeq = [(prod.GetNumAtoms(onlyExplicit=True), idx) for idx, prod in enumerate(prodSeq)] tSeq.sort() for nats, idx in tSeq: prod = prodSeq[idx] try: Chem.SanitizeMol(prod) except Exception: continue pSmi = Chem.MolToSmiles(prod, 1) if minFragmentSize > 0: nDummies = pSmi.count('*') if nats - nDummies < minFragmentSize: seqOk = False break prod.pSmi = pSmi ts = [(x, prodSeq[y]) for x, y in tSeq] prodSeq = ts if seqOk: matched = True for nats, prod in prodSeq: pSmi = prod.pSmi #print('\t',nats,pSmi) if pSmi not in allNodes: if not singlePass: activePool[pSmi] = prod allNodes.add(pSmi) foundMols[pSmi] = prod if singlePass or keepNonLeafNodes or not matched: newPool[nSmi] = mol activePool = newPool if not (singlePass or keepNonLeafNodes): if not returnMols: res = set(activePool.keys()) else: res = activePool.values() else: if not returnMols: res = allNodes else: res = foundMols.values() return res