def eval_function(text):
    """Score a generated DeepSMILES token sequence by its rescaled logP.

    Args:
        text: iterable of string tokens; they are concatenated into a single
            DeepSMILES string before decoding.

    Returns:
        0 when decoding or sanitizing the string raises; otherwise
        ``(factor * MolLogP(mol) - logp_min) / (logp_max - logp_min)``.
    """
    generated = ''.join(text)
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        # Keep the sanitized SMILES from this validated pass instead of
        # decoding and sanitizing the same string a second time afterwards.
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        # Best-effort scoring: anything that cannot be decoded gets no reward.
        return 0

    mol = Chem.MolFromSmiles(smiles)
    # factor, logp_min and logp_max are read from the enclosing scope.
    logp = factor * MolLogP(mol)
    # Rescale logP relative to the [logp_min, logp_max] interval.
    score = (logp - logp_min) / (logp_max - logp_min)
    # NOTE: an additional length-based reward term was sketched here earlier;
    # it is not part of the score.
    logger.info("%s, %s" % (generated, str(score)))
    return score
def GetNeighborLists(probes, topN, pool, simMetric=DataStructs.DiceSimilarity, silent=False):
    """Build one top-N neighbor container per probe from the fingerprints in ``pool``.

    Args:
        probes: sequence whose items carry the probe fingerprint at index 1
            (``None`` marks a probe without a fingerprint).
        topN: size passed to each ``TopNContainer``.
        pool: iterable of ``(name, fingerprint)`` pairs to search.
        simMetric: similarity function; Dice and Tanimoto use the bulk
            variants, anything else is called as ``simMetric(probeFp, fp)``.
        silent: when false, progress is logged every 1000 pool rows.

    Returns:
        A list with one ``TopNContainer`` per probe (containers of probes
        without a fingerprint stay empty).
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [probe[1] for probe in probes]
    validProbes = [idx for idx, pfp in enumerate(probeFps) if pfp is not None]
    validFps = [probeFps[idx] for idx in validProbes]
    nbrLists = [TopNContainer(topN) for _ in probeFps]

    for nDone, (nm, fp) in enumerate(pool, 1):
        if not silent and nDone % 1000 == 0:
            logger.info(' searched %d rows'%nDone)
        if simMetric == DataStructs.DiceSimilarity:
            scores = DataStructs.BulkDiceSimilarity(fp, validFps)
        elif simMetric == DataStructs.TanimotoSimilarity:
            scores = DataStructs.BulkTanimotoSimilarity(fp, validFps)
        else:
            # No bulk variant available: score the valid probes one at a time.
            scores = [simMetric(pfp, fp) for pfp in validFps]
        # scores is aligned with validFps, so map back to the probe index.
        for probeIdx, score in zip(validProbes, scores):
            nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
def dividetask(data, task, silent=True):
    """Apply ``task`` to every element of ``data``, split across the MPI ranks.

    The data is broadcast from rank 0, each rank processes one contiguous
    chunk, and rank 0 additionally processes the leftover leading elements.

    Args:
        data: indexable collection of work items (taken from rank 0).
        task: callable applied to each element.
        silent: when false, each completed element is logged.

    Returns:
        On rank 0, the concatenation of the per-rank result lists as returned
        by the gather; on every other rank, an empty list.
    """
    world = mpi.world
    data = mpi.broadcast(world, data, 0)
    rank = world.rank
    chunkSize, extraBits = divmod(len(data), world.size)
    isRoot = rank == 0

    res = []

    def runOne(elem, count):
        res.append(task(elem))
        if not silent:
            logger.info('task(%d) done %d' % (rank, count))

    # the root node handles the extra pieces:
    if isRoot:
        for i in range(extraBits):
            runOne(data[i], i + 1)

    start = extraBits + rank * chunkSize
    for i in range(chunkSize):
        runOne(data[start + i], i + 1)

    # Every rank takes part in the gather; only the root uses what comes back.
    gathered = mpi.gather(world, res, 0)
    allRes = []
    if isRoot:
        for part in gathered:
            allRes.extend(part)
    return allRes
def eval_function(text):
    """Score a generated DeepSMILES sequence by aromaticity and perplexity.

    Args:
        text: iterable of string tokens; joined into one DeepSMILES string and
            also passed unchanged to ``lm.perplexity``.

    Returns:
        0 when decoding or sanitizing raises; otherwise the equal-weight mean
        of ``perplexity / (1 + perplexity)`` and ``num_aromatic_atoms / 23``.
    """
    generated = ''.join(text)
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        # Keep the sanitized SMILES from this validated pass instead of
        # decoding and sanitizing the same string a second time afterwards.
        smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
    except Exception:
        # Best-effort scoring: anything that cannot be decoded gets no reward.
        return 0

    mol = Chem.MolFromSmiles(smiles)
    num_aromatic_atoms = sum(
        1 for i in range(mol.GetNumAtoms()) if mol.GetAtomWithIdx(i).GetIsAromatic())
    # NOTE(review): 23 is a hard-coded normalizer -- presumably the largest
    # aromatic atom count expected for the configured text length; confirm.
    arom_reward = num_aromatic_atoms / 23

    perplexity = lm.perplexity(text)
    # Squash the unbounded perplexity into a bounded reward term.
    perplexity_reward = perplexity / (1 + perplexity)

    score = (perplexity_reward * 0.5) + (arom_reward * 0.5)
    logger.info("%s, %s" % (generated, str(score)))
    return score
def dividetask(data,task,silent=True):
    """Run ``task`` over ``data`` with the work divided between MPI ranks.

    ``data`` is broadcast from rank 0. Each rank handles its own contiguous
    chunk; rank 0 also handles the remainder elements at the front.

    Args:
        data: indexable collection of work items (taken from rank 0).
        task: callable applied to each element.
        silent: when false, each completed element is logged.

    Returns:
        The combined results on rank 0, an empty list on all other ranks.
    """
    data = mpi.broadcast(mpi.world, data, 0)
    rank = mpi.world.rank
    nProcs = mpi.world.size
    chunkSize = len(data) // nProcs
    extraBits = len(data) % nProcs

    # Work plan for this rank: (index into data, 1-based counter for the log).
    jobs = []
    # the root node handles the extra pieces:
    if rank == 0:
        jobs.extend((i, i + 1) for i in range(extraBits))
    first = extraBits + rank * chunkSize
    jobs.extend((first + i, i + 1) for i in range(chunkSize))

    res = []
    for idx, count in jobs:
        res.append(task(data[idx]))
        if not silent:
            logger.info('task(%d) done %d'%(rank,count))

    # All ranks must call gather; only rank 0 consumes the gathered lists.
    gathered = mpi.gather(mpi.world, res, 0)
    allRes = []
    if rank == 0:
        for part in gathered:
            allRes.extend(part)
    return allRes
def eval_function(text):
    """Score a generated DeepSMILES sequence with ``jscorer``.

    Args:
        text: iterable of string tokens forming the DeepSMILES string.

    Returns:
        -1.0 when decoding or sanitizing raises; otherwise the raw score
        squashed as ``j / (1 + |j|)``.
    """
    generated = ''.join(text)
    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        # Undecodable sequences receive the penalty value.
        return -1.0

    raw = jscorer.score(smiles)
    score = raw / (1 + np.abs(raw))
    logger.info("%s, %s" % (generated, str(score)))
    return score
def GetNeighborLists(probes, topN, pool, simMetric=DataStructs.DiceSimilarity, simThresh=-1.,
                     silent=False, **kwargs):
    """Collect, for every probe, its most similar entries from ``pool``.

    Args:
        probes: sequence whose items carry the probe fingerprint at index 1
            (``None`` marks a probe without a fingerprint).
        topN: container size used when no positive ``simThresh`` is given.
        pool: iterable of ``(name, fingerprint)`` pairs to search.
        simMetric: similarity function. Dice, Tanimoto and Tversky use the
            bulk variants; anything else is called as ``simMetric(probeFp, fp)``.
        simThresh: only scores strictly greater than this value are stored.
            When it is positive the containers are created with size -1
            instead of ``topN``.
        silent: when false, progress is logged every 1000 pool rows.
        **kwargs: ``tverskyA`` / ``tverskyB`` weights for the Tversky metric
            (each defaults to 0.5).

    Returns:
        A list with one ``TopNContainer`` per probe, in probe order;
        containers of probes without a fingerprint stay empty.
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    validProbes = [x for x in range(len(probeFps)) if probeFps[x] is not None]
    validFps = [probeFps[x] for x in validProbes]

    # NOTE(review): size -1 presumably makes the container unbounded so that
    # every hit above the threshold is kept -- confirm against TopNContainer.
    containerSize = topN if simThresh <= 0 else -1
    nbrLists = [TopNContainer(containerSize) for _ in probeFps]

    nDone = 0
    for nm, fp in pool:
        nDone += 1
        if not silent and not nDone % 1000:
            logger.info(' searched %d rows' % nDone)

        if simMetric == DataStructs.DiceSimilarity:
            scores = DataStructs.BulkDiceSimilarity(fp, validFps)
        elif simMetric == DataStructs.TanimotoSimilarity:
            scores = DataStructs.BulkTanimotoSimilarity(fp, validFps)
        elif simMetric == DataStructs.TverskySimilarity:
            av = float(kwargs.get('tverskyA', 0.5))
            bv = float(kwargs.get('tverskyB', 0.5))
            scores = DataStructs.BulkTverskySimilarity(fp, validFps, av, bv)
        else:
            # No bulk variant: score the valid probes one by one.
            scores = [simMetric(pfp, fp) for pfp in validFps]

        # scores is aligned with validFps, so translate each position back to
        # its probe index. The generic branch used to index validProbes with
        # the probe index itself, which hit the wrong container (or raised
        # IndexError) as soon as one probe had no fingerprint.
        for probeIdx, score in zip(validProbes, scores):
            if score > simThresh:
                nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
def eval_function(text):
    """Reward longer valid DeepSMILES sequences.

    Args:
        text: iterable of string tokens forming the DeepSMILES string.

    Returns:
        0 when decoding or sanitizing raises; otherwise the number of tokens
        in the extracted sequence divided by ``text_length - 1``.
    """
    generated = ''.join(text)
    try:
        # The result is discarded: this pass only checks that the sequence
        # decodes and sanitizes without raising.
        DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return 0

    extracted = DeepSMILESLanguageModelUtils.extract(generated, start='<s>', end='</s>')
    tokens = DeepSMILESTokenizer(extracted).get_tokens()
    # provide more reward for longer text sequences
    score = len(tokens) / (text_length - 1)
    logger.info("%s, %s" % (generated, str(score)))
    return score
def testCairoFile(self):
    """Draw ``self.mol`` to a PNG file with the cairo canvas and check it is non-empty.

    The test returns early (after logging) when the cairo canvas cannot be
    imported.
    """
    try:
        # Imported only to find out whether the cairo backend is available.
        from rdkit.Chem.Draw.cairoCanvas import Canvas
    except ImportError:
        logger.info("Skipping cairo test")
        return
    os.environ['RDKIT_CANVAS'] = 'cairo'
    fd, fn = tempfile.mkstemp(suffix='.png')
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not close it, so close it explicitly before the file is written by name.
    os.close(fd)
    try:
        # assertEqual/assertNotEqual replace the failUnlessEqual/failIfEqual
        # aliases, which no longer exist in current unittest.
        self.assertEqual(os.path.getsize(fn), 0)
        Draw.MolToFile(self.mol, fn)
        self.assertNotEqual(os.path.getsize(fn), 0)
    finally:
        # Best-effort cleanup, also when an assertion above fails.
        try:
            os.unlink(fn)
        except OSError:
            pass
def testSpingFile(self):
    """Draw ``self.mol`` to a PNG file with the sping canvas and check it is non-empty.

    The test returns early (after logging) when the sping canvas cannot be
    imported.
    """
    try:
        # Imported only to find out whether the sping backend is available.
        from rdkit.Chem.Draw.spingCanvas import Canvas
    except ImportError:
        logger.info("Skipping sping test")
        return
    os.environ['RDKIT_CANVAS'] = 'sping'
    fd, fn = tempfile.mkstemp(suffix='.png')
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not close it, so close it explicitly before the file is written by name.
    os.close(fd)
    try:
        self.assertEqual(os.path.getsize(fn), 0)
        Draw.MolToFile(self.mol, fn)
        self.assertNotEqual(os.path.getsize(fn), 0)
    finally:
        # Best-effort cleanup, also when an assertion above fails.
        try:
            os.unlink(fn)
        except OSError:
            pass
def GetNeighborLists(probes,topN,pool,
                     simMetric=DataStructs.DiceSimilarity,
                     simThresh=-1.,
                     silent=False,
                     **kwargs):
    """Find, for each probe, its nearest neighbors among the ``pool`` fingerprints.

    Args:
        probes: sequence whose items carry the probe fingerprint at index 1
            (``None`` marks a probe without a fingerprint).
        topN: container size used when no positive ``simThresh`` is given.
        pool: iterable of ``(name, fingerprint)`` pairs to search.
        simMetric: similarity function. Dice, Tanimoto and Tversky use the
            bulk variants; anything else is called as ``simMetric(probeFp, fp)``.
        simThresh: only scores strictly greater than this value are stored.
            When it is positive the containers are created with size -1
            instead of ``topN``.
        silent: when false, progress is logged every 1000 pool rows.
        **kwargs: ``tverskyA`` / ``tverskyB`` weights for the Tversky metric
            (each defaults to 0.5).

    Returns:
        A list with one ``TopNContainer`` per probe, in probe order;
        containers of probes without a fingerprint stay empty.
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    validProbes = [x for x in range(len(probeFps)) if probeFps[x] is not None]
    validFps = [probeFps[x] for x in validProbes]

    # NOTE(review): size -1 presumably makes the container unbounded so that
    # every hit above the threshold is kept -- confirm against TopNContainer.
    if simThresh <= 0:
        nbrLists = [TopNContainer(topN) for _ in probeFps]
    else:
        nbrLists = [TopNContainer(-1) for _ in probeFps]

    def scoreAgainstProbes(fp):
        """Return the scores of ``fp`` against validFps, in validFps order."""
        if simMetric == DataStructs.DiceSimilarity:
            return DataStructs.BulkDiceSimilarity(fp, validFps)
        if simMetric == DataStructs.TanimotoSimilarity:
            return DataStructs.BulkTanimotoSimilarity(fp, validFps)
        if simMetric == DataStructs.TverskySimilarity:
            av = float(kwargs.get('tverskyA',0.5))
            bv = float(kwargs.get('tverskyB',0.5))
            return DataStructs.BulkTverskySimilarity(fp, validFps, av, bv)
        # No bulk variant: score the valid probes one by one.
        return [simMetric(pfp, fp) for pfp in validFps]

    nDone = 0
    for nm, fp in pool:
        nDone += 1
        if not silent and not nDone % 1000:
            logger.info(' searched %d rows'%nDone)
        # Scores are aligned with validFps, so translate each position back
        # to its probe index. The generic branch used to index validProbes
        # with the probe index itself, which hit the wrong container (or
        # raised IndexError) as soon as one probe had no fingerprint.
        for probeIdx, score in zip(validProbes, scoreAgainstProbes(fp)):
            if score > simThresh:
                nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
def testSpingFile(self):
    """Check that drawing ``self.mol`` through the sping canvas writes a non-empty PNG.

    Returns early (after logging) when the sping canvas cannot be imported.
    """
    try:
        # Availability probe only; the imported name itself is not used.
        from rdkit.Chem.Draw.spingCanvas import Canvas
    except ImportError:
        logger.info("Skipping sping test")
        return
    os.environ['RDKIT_CANVAS']='sping'
    fd, fn = tempfile.mkstemp(suffix='.png')
    # The descriptor returned by mkstemp stays open until closed explicitly;
    # setting the variable to None leaked it.
    os.close(fd)
    try:
        self.assertEqual(os.path.getsize(fn), 0)
        Draw.MolToFile(self.mol, fn)
        self.assertNotEqual(os.path.getsize(fn), 0)
    finally:
        # Remove the temp file even if an assertion failed; ignore a failure
        # to delete it.
        try:
            os.unlink(fn)
        except OSError:
            pass
def testAggFile(self):
    """Draw ``self.mol`` to a PNG file with the agg canvas and check it is non-empty.

    The test returns early (after logging) when the agg canvas cannot be
    imported.
    """
    try:
        # Imported only to find out whether the agg backend is available.
        from rdkit.Chem.Draw.aggCanvas import Canvas
    except ImportError:
        logger.info("Skipping agg test")
        return
    os.environ['RDKIT_CANVAS'] = 'agg'
    fd, fn = tempfile.mkstemp(suffix='.png')
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not close it, so close it explicitly before the file is written by name.
    os.close(fd)
    try:
        # assertEqual/assertNotEqual replace the failUnlessEqual/failIfEqual
        # aliases, which no longer exist in current unittest.
        self.assertEqual(os.path.getsize(fn), 0)
        Draw.MolToFile(self.mol, fn)
        self.assertNotEqual(os.path.getsize(fn), 0)
    finally:
        # Best-effort cleanup, also when an assertion above fails.
        try:
            os.unlink(fn)
        except OSError:
            pass
def testSpingFile(self):
    """Draw ``self.mol`` to a PNG file with the sping canvas and check it is non-empty.

    The test returns early (after logging) when the sping canvas cannot be
    imported.
    """
    try:
        # Imported only to find out whether the sping backend is available.
        from rdkit.Chem.Draw.spingCanvas import Canvas
    except ImportError:
        logger.info("Skipping sping test")
        return
    os.environ["RDKIT_CANVAS"] = "sping"
    fd, fn = tempfile.mkstemp(suffix=".png")
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not close it, so close it explicitly before the file is written by name.
    os.close(fd)
    try:
        # assertEqual/assertNotEqual replace the failUnlessEqual/failIfEqual
        # aliases, which no longer exist in current unittest.
        self.assertEqual(os.path.getsize(fn), 0)
        Draw.MolToFile(self.mol, fn)
        self.assertNotEqual(os.path.getsize(fn), 0)
    finally:
        # Best-effort cleanup, also when an assertion above fails.
        try:
            os.unlink(fn)
        except OSError:
            pass
def eval_function(text):
    """Score a generated DeepSMILES sequence with ``qedscorer``, penalizing repeats.

    Args:
        text: iterable of string tokens forming the DeepSMILES string.

    Returns:
        -1.0 when decoding/sanitizing raises or when the SMILES is already a
        key of ``all_smiles``; otherwise the raw score squashed as
        ``q / (1 + |q|)``. New SMILES are recorded in ``all_smiles`` together
        with their raw score.
    """
    global all_smiles

    generated = ''.join(text)
    try:
        smiles = DeepSMILESLanguageModelUtils.sanitize(
            DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>'))
    except Exception:
        return -1.0

    if smiles not in all_smiles:
        raw = qedscorer.score(smiles)
        score = raw / (1 + np.abs(raw))
        # Remember the raw (unsquashed) score for this molecule.
        all_smiles[smiles] = raw
    else:
        # Already generated before: same penalty as an invalid sequence.
        score = -1.0

    logger.info("%s, %s" % (smiles, str(score)))
    return score
# Command-line option definitions followed by the start of the script's main
# section: parse the options, open the file named by the last positional
# argument for output, then read the training molecules from the SD file
# args[0] and collect (name, SMILES, fingerprint, property) tuples in `train`.
# When more than two positional arguments are given, args[1] is opened as a
# second SD file and haveTest is set.
# NOTE(review): file() is a Python 2 builtin, and the local name `property`
# shadows the builtin of the same name.
parser.add_option('--maxPathLength','--max',default=8,type=int, help='maximum length path for the fingerprint') parser.add_option('--similarityThreshold','--sim',default=[0.9],type='floatlist', help='threshold for similarity') parser.add_option('--numNeighbors','--num','-n','-k',default=50,type=int, help='number of neighbors to consider') parser.add_option('--neighborsFile','--nbrs',default='', help='name of an output file to hold the neighbor lists') parser.add_option('--scan',default=False,action="store_true") if __name__=='__main__': options,args = parser.parse_args() outF = file(args[-1],'w+') logger.info('reading training molecules and generating fingerprints') suppl = Chem.SDMolSupplier(args[0]) train=[] for i,mol in enumerate(suppl): if not mol: continue smi = Chem.MolToSmiles(mol,True) nm = mol.GetProp(nameField) property = float(mol.GetProp(propField)) fp = GetMolFingerprint(mol,options.maxPathLength) train.append((nm,smi,fp,property)) logger.info(' got %d molecules'%len(train)) if len(args)>2: suppl = Chem.SDMolSupplier(args[1]) haveTest=True
"benchmark and test fingerprint screenout and substructure searching") parser.add_argument("--validate", dest='validateResults', default=False, action='store_true', help="validate that the screenout isn't missing anything") parser.add_argument("--short", dest='doShort', default=False, action='store_true', help="run a small subset of the molecules") args = parser.parse_args() ts = [] logger.info('mols from smiles') mols = [] t1 = time.time() # find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/chembl21_25K.pairs.txt.gz with gzip.open('../Data/chembl21_25K.pairs.txt.gz', 'rb') as inf: for line in inf: line = line.decode().strip().split() smi1 = line[1] smi2 = line[3] mols.append(Chem.MolFromSmiles(smi1)) mols.append(Chem.MolFromSmiles(smi2)) if args.doShort and len(mols) >= 1000: break t2 = time.time() ts.append(t2 - t1) logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds, {len(mols)} mols')
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger

logger = logger()

# One flag per numbered step, all enabled by default. When step numbers are
# given on the command line only those steps (plus step 1) are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
# Elapsed time of each timed step, in seconds.
ts = []

# Read the whole compressed SD file up front; the context manager closes the
# gzip handle, which the former one-liner left open.
with gzip.open('../Data/mols.1000.sdf.gz', 'rb') as inf:
    sdData = inf.read()
logger.info('mols from sdf')
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)
mols = []
nMols = 0
nBad = 0
t1 = time.time()
for m in suppl:
    # The supplier yields a false value for records it could not parse.
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed'%(t2-t1,nMols,nBad))
ts.append(t2 - t1)
# Module-level settings followed by the start of the script's main section:
# molecules are read from the SD file sys.argv[1], sys.argv[2] is opened for
# output, and (name, SMILES, property, fingerprint) tuples are collected in
# `data` before the nested loop walks all index pairs with i < j.
# NOTE(review): file() is a Python 2 builtin, and the local name `property`
# shadows the builtin of the same name.
# propField is the name of the property (from the SD file) you want to use # as the "activity" propField='chemical_shift_1' # similarity threshold for a pair to be considered interesting. # (i.e. pairs with a similiarity below this value will not be # added to the output. similarityThreshold=0.5 if __name__=='__main__': suppl = Chem.SDMolSupplier(sys.argv[1]) outF = file(sys.argv[2],'w+') data=[] logger.info('reading molecules and generating fingeprints') for i,mol in enumerate(suppl): if not mol: continue smi = Chem.MolToSmiles(mol,True) nm = mol.GetProp(nameField) property = float(mol.GetProp(propField)) fp = GetMolFingerprint(mol,maxPathLength) data.append((nm,smi,property,fp)) logger.info(' got %d molecules'%len(data)) logger.info('calculating pairs') pairs = [] for i in range(len(data)): for j in range(i+1,len(data)):
# CreateDb(options, dataFilename='', supplier=None)
#
# Builds the molecule database and, depending on the options.do* flags, the
# derived fingerprint and descriptor tables.
#
#  - Raises ValueError when neither dataFilename nor supplier is provided.
#  - options.noExtras switches off every derived table (atom pairs,
#    descriptors, RDKit / layered / Pharm2D / Gobbi2D / Morgan fingerprints).
#  - With options.loadMols set and no supplier passed in, a missing
#    options.molFormat is guessed from the file extension ('.sdf' -> 'sdf';
#    '.smi', '.smiles', '.txt', '.csv' -> 'smiles'), a missing delimiter is
#    guessed with csv.Sniffer from the first 2000 characters of the file, and
#    a SMILES or SD supplier is created. Loader.LoadDb then fills the database
#    options.molDbName.
#  - Each enabled table is dropped (errors from the drop are ignored) and
#    re-created.
#  - The molecules are then read back from the options.regName table one row
#    at a time; the rows for every enabled table are buffered and written with
#    executemany + Commit whenever the row counter is a multiple of 500, and
#    once more after the loop.
#
# NOTE(review): the bare `except:` around fetchone()/tuple indexing is what
# ends the read loop (presumably fetchone() returns None once the cursor is
# exhausted), so any other error raised there also ends the loop silently.
# NOTE(review): errFile and the file opened for the delimiter sniff are never
# closed in this function; inTF.close() inside the `with` block is redundant.
# NOTE(review): the descriptor calculator is restored with cPickle.load --
# only point options.descriptorCalcFilename at trusted files.
# NOTE(review): table and column names from `options` are interpolated into
# the SQL text with %.
def CreateDb(options,dataFilename='',supplier=None): if not dataFilename and supplier is None: raise ValueError('Please provide either a data filename or a supplier') if options.errFilename: errFile=open(os.path.join(options.outDir,options.errFilename),'w+') else: errFile=None if options.noExtras: options.doPairs=False options.doDescriptors=False options.doFingerprints=False options.doPharm2D=False options.doGobbi2D=False options.doLayered=False options.doMorganFps=False if options.loadMols: if supplier is None: if not options.molFormat: ext = os.path.splitext(dataFilename)[-1].lower() if ext=='.sdf': options.molFormat='sdf' elif ext in ('.smi','.smiles','.txt','.csv'): options.molFormat='smiles' if not options.delimiter: # guess the delimiter import csv sniffer = csv.Sniffer() dlct=sniffer.sniff(open(dataFilename,'r').read(2000)) options.delimiter=dlct.delimiter if not options.silent: logger.info('Guessing that delimiter is %s. Use --delimiter argument if this is wrong.'%repr(options.delimiter)) if not options.silent: logger.info('Guessing that mol format is %s. 
Use --molFormat argument if this is wrong.'%repr(options.molFormat)) if options.molFormat=='smiles': if options.delimiter=='\\t': options.delimiter='\t' supplier=Chem.SmilesMolSupplier(dataFilename, titleLine=options.titleLine, delimiter=options.delimiter, smilesColumn=options.smilesColumn, nameColumn=options.nameColumn ) else: supplier = Chem.SDMolSupplier(dataFilename) if not options.silent: logger.info('Reading molecules and constructing molecular database.') Loader.LoadDb(supplier,os.path.join(options.outDir,options.molDbName), errorsTo=errFile,regName=options.regName,nameCol=options.molIdName, skipProps=options.skipProps,defaultVal=options.missingPropertyVal, addComputedProps=options.addProps,uniqNames=True, skipSmiles=options.skipSmiles,maxRowsCached=int(options.maxRowsCached), silent=options.silent,nameProp=options.nameProp, lazySupplier=int(options.maxRowsCached)>0, startAnew=not options.updateDb ) if options.doPairs: pairConn = DbConnect(os.path.join(options.outDir,options.pairDbName)) pairCurs = pairConn.GetCursor() try: pairCurs.execute('drop table %s'%(options.pairTableName)) except: pass pairCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,atompairfp blob,torsionfp blob)'%(options.pairTableName, options.molIdName)) if options.doFingerprints or options.doPharm2D or options.doGobbi2D or options.doLayered: fpConn = DbConnect(os.path.join(options.outDir,options.fpDbName)) fpCurs=fpConn.GetCursor() try: fpCurs.execute('drop table %s'%(options.fpTableName)) except: pass try: fpCurs.execute('drop table %s'%(options.pharm2DTableName)) except: pass try: fpCurs.execute('drop table %s'%(options.gobbi2DTableName)) except: pass try: fpCurs.execute('drop table %s'%(options.layeredTableName)) except: pass if options.doFingerprints: fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,rdkfp blob)'%(options.fpTableName, options.molIdName)) if options.doLayered: layeredQs = 
','.join('?'*LayeredOptions.nWords) colDefs=','.join(['Col_%d integer'%(x+1) for x in range(LayeredOptions.nWords)]) fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,%s)'%(options.layeredTableName, options.molIdName, colDefs)) if options.doPharm2D: fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,pharm2dfp blob)'%(options.pharm2DTableName, options.molIdName)) sigFactory = BuildSigFactory(options) if options.doGobbi2D: fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,gobbi2dfp blob)'%(options.gobbi2DTableName, options.molIdName)) from rdkit.Chem.Pharm2D import Generate,Gobbi_Pharm2D if options.doMorganFps : fpConn = DbConnect(os.path.join(options.outDir,options.fpDbName)) fpCurs=fpConn.GetCursor() try: fpCurs.execute('drop table %s'%(options.morganFpTableName)) except: pass fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,morganfp blob)'%(options.morganFpTableName, options.molIdName)) if options.doDescriptors: descrConn=DbConnect(os.path.join(options.outDir,options.descrDbName)) with open(options.descriptorCalcFilename,'r') as inTF: buf = inTF.read().replace('\r\n', '\n').encode('utf-8') inTF.close() calc = cPickle.load(io.BytesIO(buf)) nms = [x for x in calc.GetDescriptorNames()] descrCurs = descrConn.GetCursor() descrs = ['guid integer not null primary key','%s varchar not null unique'%options.molIdName] descrs.extend(['%s float'%x for x in nms]) try: descrCurs.execute('drop table %s'%(options.descrTableName)) except: pass descrCurs.execute('create table %s (%s)'%(options.descrTableName,','.join(descrs))) descrQuery=','.join([DbModule.placeHolder]*len(descrs)) pairRows = [] fpRows = [] layeredRows = [] descrRows = [] pharm2DRows=[] gobbi2DRows=[] morganRows = [] if not options.silent: logger.info('Generating fingerprints and descriptors:') molConn = 
DbConnect(os.path.join(options.outDir,options.molDbName)) molCurs = molConn.GetCursor() if not options.skipSmiles: molCurs.execute('select guid,%s,smiles,molpkl from %s'%(options.molIdName,options.regName)) else: molCurs.execute('select guid,%s,molpkl from %s'%(options.molIdName,options.regName)) i=0 while 1: try: tpl = molCurs.fetchone() molGuid = tpl[0] molId = tpl[1] pkl = tpl[-1] i+=1 except: break if isinstance(pkl,(bytes,str)): mol = Chem.Mol(pkl) else: mol = Chem.Mol(str(pkl)) if not mol: continue if options.doPairs: pairs = FingerprintUtils.BuildAtomPairFP(mol) torsions = FingerprintUtils.BuildTorsionsFP(mol) pkl1 = DbModule.binaryHolder(pairs.ToBinary()) pkl2 = DbModule.binaryHolder(torsions.ToBinary()) row = (molGuid,molId,pkl1,pkl2) pairRows.append(row) if options.doFingerprints: fp2 = FingerprintUtils.BuildRDKitFP(mol) pkl = DbModule.binaryHolder(fp2.ToBinary()) row = (molGuid,molId,pkl) fpRows.append(row) if options.doLayered: words = LayeredOptions.GetWords(mol) row = [molGuid,molId]+words layeredRows.append(row) if options.doDescriptors: descrs= calc.CalcDescriptors(mol) row = [molGuid,molId] row.extend(descrs) descrRows.append(row) if options.doPharm2D: FingerprintUtils.sigFactory=sigFactory fp= FingerprintUtils.BuildPharm2DFP(mol) pkl = DbModule.binaryHolder(fp.ToBinary()) row = (molGuid,molId,pkl) pharm2DRows.append(row) if options.doGobbi2D: FingerprintUtils.sigFactory=Gobbi_Pharm2D.factory fp= FingerprintUtils.BuildPharm2DFP(mol) pkl = DbModule.binaryHolder(fp.ToBinary()) row = (molGuid,molId,pkl) gobbi2DRows.append(row) if options.doMorganFps: morgan = FingerprintUtils.BuildMorganFP(mol) pkl = DbModule.binaryHolder(morgan.ToBinary()) row = (molGuid,molId,pkl) morganRows.append(row) if not i%500: if len(pairRows): pairCurs.executemany('insert into %s values (?,?,?,?)'%options.pairTableName, pairRows) pairRows = [] pairConn.Commit() if len(fpRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.fpTableName, fpRows) fpRows = [] 
fpConn.Commit() if len(layeredRows): fpCurs.executemany('insert into %s values (?,?,%s)'%(options.layeredTableName,layeredQs), layeredRows) layeredRows = [] fpConn.Commit() if len(descrRows): descrCurs.executemany('insert into %s values (%s)'%(options.descrTableName,descrQuery), descrRows) descrRows = [] descrConn.Commit() if len(pharm2DRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.pharm2DTableName, pharm2DRows) pharm2DRows = [] fpConn.Commit() if len(gobbi2DRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.gobbi2DTableName, gobbi2DRows) gobbi2DRows = [] fpConn.Commit() if len(morganRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.morganFpTableName, morganRows) morganRows = [] fpConn.Commit() if not options.silent and not i%500: logger.info(' Done: %d'%(i)) if len(pairRows): pairCurs.executemany('insert into %s values (?,?,?,?)'%options.pairTableName, pairRows) pairRows = [] pairConn.Commit() if len(fpRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.fpTableName, fpRows) fpRows = [] fpConn.Commit() if len(layeredRows): fpCurs.executemany('insert into %s values (?,?,%s)'%(options.layeredTableName,layeredQs), layeredRows) layeredRows = [] fpConn.Commit() if len(descrRows): descrCurs.executemany('insert into %s values (%s)'%(options.descrTableName,descrQuery), descrRows) descrRows = [] descrConn.Commit() if len(pharm2DRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.pharm2DTableName, pharm2DRows) pharm2DRows = [] fpConn.Commit() if len(gobbi2DRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.gobbi2DTableName, gobbi2DRows) gobbi2DRows = [] fpConn.Commit() if len(morganRows): fpCurs.executemany('insert into %s values (?,?,?)'%options.morganFpTableName, morganRows) morganRows = [] fpConn.Commit() if not options.silent: logger.info('Finished.')
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger

logger = logger()

# One flag per numbered step, all enabled by default. When step numbers are
# given on the command line only those steps (plus step 1) are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
ts = []

# Read the whole compressed SD file up front; the context manager closes the
# gzip handle, which the former one-liner left open.
with gzip.open('../Data/mols.1000.sdf.gz', 'rb') as inf:
    sdData = inf.read()
logger.info('mols from sdf')
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)
mols = []
nMols = 0
nBad = 0
t1 = time.time()
for m in suppl:
    # The supplier yields a false value for records it could not parse.
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
# Baseline: sample 1000 sequences from an empty (untrained) language model and
# keep the valid molecule with the lowest logP.
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True when ``score`` is lower than the best score seen so far."""
    return score < current_best_score


for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)
        logp_score = MolLogP(Chem.MolFromSmiles(sanitized))
        logger.info("successful: %s , score: %s" % (sanitized, str(logp_score)))
        is_first = current_best_score is None
        if is_first or beats_current(logp_score):
            current_best_score, current_best_smiles = logp_score, sanitized
    except Exception:
        # Sequences that fail to decode, sanitize or score are simply skipped.
        pass

logger.info("best: %s , score: %s" % (current_best_smiles, str(current_best_score)))
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger

logger = logger()

# One flag per numbered step, all enabled by default. When step numbers are
# given on the command line only those steps (plus step 1) are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
# Elapsed time of each timed step, in seconds.
ts = []

mols = []
# Load all lines before the timer starts; the context manager closes the gzip
# handle, which the former one-liner left open.
with gzip.open('../Data/znp.50k.smi.gz', 'rb') as inf:
    lines = inf.readlines()
logger.info('mols from smiles')
nMols = 0
nBad = 0
t1 = time.time()
for line in lines:
    # NOTE(review): the file is opened in binary mode, so under Python 3 these
    # lines are bytes and split(' ') with a str separator raises TypeError;
    # decode first if this script has to run on Python 3.
    line = line.strip().split(' ')
    m = Chem.MolFromSmiles(line[0])
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed'%(t2-t1,nMols,nBad))
ts.append(t2 - t1)
# RunSearch(options, queryFilename)
#
# Runs a similarity search and/or a substructure/property query against the
# molecule database and writes the results.
#
#  1. options.similarityType ('AtomPairs', 'TopologicalTorsions', 'RDK',
#     'Pharm2D', 'Gobbi2D', 'Morgan') selects the fingerprint builder, the
#     default similarity metric and the database/table/column holding the
#     fingerprints. options.similarityMetric ('tanimoto', 'dice', 'tversky')
#     then overrides the metric; for 'tversky' the A/B weights are forwarded
#     to GetNeighborLists.
#  2. options.smilesQuery or options.smartsQuery is parsed into
#     options.queryMol; on failure an error is logged and the process exits
#     with status -1.
#  3. The output targets options.outF, options.sdfOut and options.smilesOut
#     are opened; '-' selects stdout, and an empty outF disables that output.
#  4. When queryFilename is given, the probe molecules are read from it (and
#     fingerprinted when a builder was selected); unreadable probes are
#     recorded as (None, None).
#  5. A substructure query (options.queryMol), optionally combined with
#     options.propQuery, or a property-only query produces the list `ids`.
#     If the fingerprint database exists, holds the layered table and the
#     query is not negated, that table is joined in as a prefilter.
#  6. With probes, GetNeighborLists ranks the pooled fingerprints (restricted
#     to `ids` when there are any) and the neighbors are written one row per
#     probe, or column-wise when options.transpose is set. Without probes,
#     the names of the molecules in `ids` are written.
#  7. When SD or SMILES output was requested and there are hits, the hit
#     molecules are written to those files as well.
#
# NOTE(review): Python 2 only as written -- file(), `print >>` and calling
# .sort() on dict.keys().
# NOTE(review): if options.similarityType matches none of the branches,
# fpBuilder, simMetric, dbName, fpTableName and fpColName are never assigned.
# NOTE(review): options.propQuery is interpolated into the SQL text; the
# property-only branch uses just the part before the first ';'.
def RunSearch(options,queryFilename): global sigFactory if options.similarityType=='AtomPairs': fpBuilder=FingerprintUtils.BuildAtomPairFP simMetric=DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir,options.pairDbName) fpTableName = options.pairTableName fpColName = options.pairColName elif options.similarityType=='TopologicalTorsions': fpBuilder=FingerprintUtils.BuildTorsionsFP simMetric=DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir,options.torsionsDbName) fpTableName = options.torsionsTableName fpColName = options.torsionsColName elif options.similarityType=='RDK': fpBuilder=FingerprintUtils.BuildRDKitFP simMetric=DataStructs.FingerprintSimilarity dbName = os.path.join(options.dbDir,options.fpDbName) fpTableName = options.fpTableName if not options.fpColName: options.fpColName='rdkfp' fpColName = options.fpColName elif options.similarityType=='Pharm2D': fpBuilder=FingerprintUtils.BuildPharm2DFP simMetric=DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir,options.fpDbName) fpTableName = options.pharm2DTableName if not options.fpColName: options.fpColName='pharm2dfp' fpColName = options.fpColName FingerprintUtils.sigFactory = BuildSigFactory(options) elif options.similarityType=='Gobbi2D': from rdkit.Chem.Pharm2D import Gobbi_Pharm2D fpBuilder=FingerprintUtils.BuildPharm2DFP simMetric=DataStructs.TanimotoSimilarity dbName = os.path.join(options.dbDir,options.fpDbName) fpTableName = options.gobbi2DTableName if not options.fpColName: options.fpColName='gobbi2dfp' fpColName = options.fpColName FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory elif options.similarityType=='Morgan': fpBuilder=FingerprintUtils.BuildMorganFP simMetric=DataStructs.DiceSimilarity dbName = os.path.join(options.dbDir,options.morganFpDbName) fpTableName = options.morganFpTableName fpColName = options.morganFpColName extraArgs={} if options.similarityMetric=='tanimoto': simMetric = DataStructs.TanimotoSimilarity elif options.similarityMetric=='dice': 
simMetric = DataStructs.DiceSimilarity elif options.similarityMetric=='tversky': simMetric = DataStructs.TverskySimilarity extraArgs['tverskyA']=options.tverskyA extraArgs['tverskyB']=options.tverskyB if options.smilesQuery: mol=Chem.MolFromSmiles(options.smilesQuery) if not mol: logger.error('could not build query molecule from smiles "%s"'%options.smilesQuery) sys.exit(-1) options.queryMol = mol elif options.smartsQuery: mol=Chem.MolFromSmarts(options.smartsQuery) if not mol: logger.error('could not build query molecule from smarts "%s"'%options.smartsQuery) sys.exit(-1) options.queryMol = mol if options.outF=='-': outF=sys.stdout elif options.outF=='': outF=None else: outF = file(options.outF,'w+') molsOut=False if options.sdfOut: molsOut=True if options.sdfOut=='-': sdfOut=sys.stdout else: sdfOut = file(options.sdfOut,'w+') else: sdfOut=None if options.smilesOut: molsOut=True if options.smilesOut=='-': smilesOut=sys.stdout else: smilesOut = file(options.smilesOut,'w+') else: smilesOut=None if queryFilename: try: tmpF = file(queryFilename,'r') except IOError: logger.error('could not open query file %s'%queryFilename) sys.exit(1) if options.molFormat=='smiles': func=GetMolsFromSmilesFile elif options.molFormat=='sdf': func=GetMolsFromSDFile if not options.silent: msg='Reading query molecules' if fpBuilder: msg+=' and generating fingerprints' logger.info(msg) probes=[] i=0 nms=[] for nm,smi,mol in func(queryFilename,None,options.nameProp): i+=1 nms.append(nm) if not mol: logger.error('query molecule %d could not be built'%(i)) probes.append((None,None)) continue if fpBuilder: probes.append((mol,fpBuilder(mol))) else: probes.append((mol,None)) if not options.silent and not i%1000: logger.info(" done %d"%i) else: probes=None conn=None idName = options.molIdName ids=None names=None molDbName = os.path.join(options.dbDir,options.molDbName) molIdName = options.molIdName mConn = DbConnect(molDbName) cns = [(x.lower(),y) for x,y in 
mConn.GetColumnNamesAndTypes('molecules')] idCol,idTyp=cns[0] if options.propQuery or options.queryMol: conn = DbConnect(molDbName) curs = conn.GetCursor() if options.queryMol: if not options.silent: logger.info('Doing substructure query') if options.propQuery: where='where %s'%options.propQuery else: where='' if not options.silent: curs.execute('select count(*) from molecules %(where)s'%locals()) nToDo = curs.fetchone()[0] join='' doSubstructFPs=False fpDbName = os.path.join(options.dbDir,options.fpDbName) if os.path.exists(fpDbName) and not options.negateQuery : curs.execute("attach database '%s' as fpdb"%(fpDbName)) try: curs.execute('select * from fpdb.%s limit 1'%options.layeredTableName) except: pass else: doSubstructFPs=True join = 'join fpdb.%s using (%s)'%(options.layeredTableName,idCol) query = LayeredOptions.GetQueryText(options.queryMol) if query: if not where: where='where' else: where += ' and' where += ' '+query cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s'%locals() curs.execute(cmd) row=curs.fetchone() nDone=0 ids=[] while row: id,molpkl = row if not options.zipMols: m = Chem.Mol(str(molpkl)) else: m = Chem.Mol(zlib.decompress(str(molpkl))) matched=m.HasSubstructMatch(options.queryMol) if options.negateQuery: matched = not matched if matched: ids.append(id) nDone+=1 if not options.silent and not nDone%500: if not doSubstructFPs: logger.info(' searched %d (of %d) molecules; %d hits so far'%(nDone,nToDo,len(ids))) else: logger.info(' searched through %d molecules; %d hits so far'%(nDone,len(ids))) row=curs.fetchone() if not options.silent and doSubstructFPs and nToDo: nFiltered = nToDo-nDone logger.info(' Fingerprint screenout rate: %d of %d (%%%.2f)'%(nFiltered,nToDo,100.*nFiltered/nToDo)) elif options.propQuery: if not options.silent: logger.info('Doing property query') propQuery=options.propQuery.split(';')[0] curs.execute('select %(idCol)s from molecules where %(propQuery)s'%locals()) ids = [x[0] for x in curs.fetchall()] if 
not options.silent: logger.info('Found %d molecules matching the query'%(len(ids))) t1=time.time() if probes: if not options.silent: logger.info('Finding Neighbors') conn = DbConnect(dbName) cns = conn.GetColumnNames(fpTableName) curs = conn.GetCursor() if ids: ids = [(x,) for x in ids] curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals()) curs.executemany('insert into _tmpTbl values (?)',ids) join='join _tmpTbl using (%(idCol)s)'%locals() else: join='' if cns[0].lower() != idCol.lower(): # backwards compatibility to the days when mol tables had a guid and # the fps tables did not: curs.execute("attach database '%(molDbName)s' as mols"%locals()) curs.execute(""" select %(idCol)s,%(fpColName)s from %(fpTableName)s join (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s) using (%(molIdName)s) """%(locals())) else: curs.execute('select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'%locals()) def poolFromCurs(curs,similarityMethod): row = curs.fetchone() while row: id,pkl = row fp = DepickleFP(str(pkl),similarityMethod) yield (id,fp) row = curs.fetchone() topNLists = GetNeighborLists(probes,options.topN,poolFromCurs(curs,options.similarityType), simMetric=simMetric,simThresh=options.simThresh,**extraArgs) uniqIds=set() nbrLists = {} for i,nm in enumerate(nms): topNLists[i].reverse() scores=topNLists[i].GetPts() nbrNames = topNLists[i].GetExtras() nbrs = [] for j,nbrGuid in enumerate(nbrNames): if nbrGuid is None: break else: uniqIds.add(nbrGuid) nbrs.append((nbrGuid,scores[j])) nbrLists[(i,nm)] = nbrs t2=time.time() if not options.silent: logger.info('The search took %.1f seconds'%(t2-t1)) if not options.silent: logger.info('Creating output') curs = mConn.GetCursor() ids = list(uniqIds) ids = [(x,) for x in ids] curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals()) curs.executemany('insert into _tmpTbl values (?)',ids) curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using 
(%(idCol)s)'%locals()) nmDict={} for guid,id in curs.fetchall(): nmDict[guid]=str(id) ks = nbrLists.keys() ks.sort() if not options.transpose: for i,nm in ks: nbrs= nbrLists[(i,nm)] nbrTxt=options.outputDelim.join([nm]+['%s%s%.3f'%(nmDict[id],options.outputDelim,score) for id,score in nbrs]) if outF: print >>outF,nbrTxt else: labels = ['%s%sSimilarity'%(x[1],options.outputDelim) for x in ks] if outF: print >>outF,options.outputDelim.join(labels) for i in range(options.topN): outL = [] for idx,nm in ks: nbr = nbrLists[(idx,nm)][i] outL.append(nmDict[nbr[0]]) outL.append('%.3f'%nbr[1]) if outF: print >>outF,options.outputDelim.join(outL) else: if not options.silent: logger.info('Creating output') curs = mConn.GetCursor() ids = [(x,) for x in set(ids)] curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals()) curs.executemany('insert into _tmpTbl values (?)',ids) molIdName = options.molIdName curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'%locals()) nmDict={} for guid,id in curs.fetchall(): nmDict[guid]=str(id) if outF: print >>outF,'\n'.join(nmDict.values()) if molsOut and ids: molDbName = os.path.join(options.dbDir,options.molDbName) cns = [x.lower() for x in mConn.GetColumnNames('molecules')] if cns[-1]!='molpkl': cns.remove('molpkl') cns.append('molpkl') curs = mConn.GetCursor() #curs.execute('create temporary table _tmpTbl (guid integer)'%locals()) #curs.executemany('insert into _tmpTbl values (?)',ids) cnText=','.join(cns) curs.execute('select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)'%locals()) row=curs.fetchone() molD = {} while row: row = list(row) pkl = row[-1] m = Chem.Mol(str(pkl)) guid = row[0] nm = nmDict[guid] if sdfOut: m.SetProp('_Name',nm) print >>sdfOut,Chem.MolToMolBlock(m) for i in range(1,len(cns)-1): pn = cns[i] pv = str(row[i]) print >>sdfOut,'> <%s>\n%s\n'%(pn,pv) print >>sdfOut,'$$$$' if smilesOut: smi=Chem.MolToSmiles(m,options.chiralSmiles) if smilesOut: print 
>>smilesOut,'%s %s'%(smi,str(row[1])) row=curs.fetchone() if not options.silent: logger.info('Done!')
# Registers a "floatlist" option type (checked by check_floatlist) on top of
# the stock optparse type checkers, builds the "distance predict" option
# parser with MyOption as its option class, and starts the script's main
# section: parse the options, open the file named by the last positional
# argument for output, then read the training molecules from the SD file
# args[0] and collect (name, SMILES, fingerprint, property) tuples in `train`.
# When more than two positional arguments are given, args[1] is opened as a
# second SD file and haveTest is set.
# NOTE(review): file() is a Python 2 builtin, and the local name `property`
# shadows the builtin of the same name.
TYPE_CHECKER = copy.copy(Option.TYPE_CHECKER) TYPE_CHECKER["floatlist"] = check_floatlist parser = OptionParser("distance predict", version="%prog", option_class=MyOption) parser.add_option("--maxPathLength", "--max", default=8, type=int, help="maximum length path for the fingerprint") parser.add_option("--similarityThreshold", "--sim", default=[0.9], type="floatlist", help="threshold for similarity") parser.add_option("--numNeighbors", "--num", "-n", "-k", default=50, type=int, help="number of neighbors to consider") parser.add_option("--neighborsFile", "--nbrs", default="", help="name of an output file to hold the neighbor lists") parser.add_option("--scan", default=False, action="store_true") if __name__ == "__main__": options, args = parser.parse_args() outF = file(args[-1], "w+") logger.info("reading training molecules and generating fingerprints") suppl = Chem.SDMolSupplier(args[0]) train = [] for i, mol in enumerate(suppl): if not mol: continue smi = Chem.MolToSmiles(mol, True) nm = mol.GetProp(nameField) property = float(mol.GetProp(propField)) fp = GetMolFingerprint(mol, options.maxPathLength) train.append((nm, smi, fp, property)) logger.info(" got %d molecules" % len(train)) if len(args) > 2: suppl = Chem.SDMolSupplier(args[1]) haveTest = True
from rdkit.RDLogger import logger from chemgrams import get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils, DeepSMILESTokenizer, \ LanguageModelMCTSWithUCB1 from rdkit import rdBase rdBase.DisableLog('rdApp.error') rdBase.DisableLog('rdApp.warning') logger = logger() if __name__ == '__main__': logger.info("loading language model...") vocab = get_arpa_vocab( '../resources/chemts_250k_deepsmiles_klm_10gram_200429.arpa') lm = KenLMDeepSMILESLanguageModel( '../resources/chemts_250k_deepsmiles_klm_10gram_200429.klm', vocab) num_simulations = 1000 width = 3 text_length = 25 start_state = ["<s>"] def eval_function(text): generated = ''.join(text) try: decoded = DeepSMILESLanguageModelUtils.decode(generated, start='<s>', end='</s>') DeepSMILESLanguageModelUtils.sanitize(decoded)
def CreateDb(options, dataFilename="", supplier=None):
    """Load molecules into the molecule database and build the requested tables.

    Depending on the ``options.do*`` flags this creates (after dropping any
    existing table of the same name) atom-pair/torsion, RDKit, layered,
    Pharm2D, Gobbi2D, Morgan fingerprint tables and a descriptor table, then
    walks the registered molecules and fills those tables in batches of 500.

    Args:
        options: settings object; read (and, for ``noExtras``, ``molFormat``
            and ``delimiter``, also written) via attribute access.
        dataFilename: path of the input molecule file; used to build a
            supplier when ``supplier`` is None and ``options.loadMols`` is set.
        supplier: an already-constructed molecule supplier, passed straight
            to ``Loader.LoadDb``.

    Raises:
        ValueError: if neither ``dataFilename`` nor ``supplier`` is given.
    """
    if not dataFilename and supplier is None:
        raise ValueError("Please provide either a data filename or a supplier")

    def dropTable(curs, tableName):
        # Best effort: a failure here normally just means the table does not
        # exist yet, so it is ignored on purpose.
        try:
            curs.execute("drop table %s" % (tableName))
        except Exception:
            pass

    if options.errFilename:
        errFile = open(os.path.join(options.outDir, options.errFilename), "w+")
    else:
        errFile = None

    if options.noExtras:
        options.doPairs = False
        options.doDescriptors = False
        options.doFingerprints = False
        options.doPharm2D = False
        options.doGobbi2D = False
        options.doLayered = False
        options.doMorganFps = False

    if options.loadMols:
        if supplier is None:
            if not options.molFormat:
                ext = os.path.splitext(dataFilename)[-1].lower()
                if ext == ".sdf":
                    options.molFormat = "sdf"
                elif ext in (".smi", ".smiles", ".txt", ".csv"):
                    options.molFormat = "smiles"
                    if not options.delimiter:
                        # guess the delimiter from the first 2000 characters
                        import csv
                        with open(dataFilename, "r") as inF:
                            sample = inF.read(2000)
                        dlct = csv.Sniffer().sniff(sample)
                        options.delimiter = dlct.delimiter
                        if not options.silent:
                            logger.info(
                                "Guessing that delimiter is %s. Use --delimiter argument if this is wrong."
                                % repr(options.delimiter)
                            )
                if not options.silent:
                    logger.info(
                        "Guessing that mol format is %s. Use --molFormat argument if this is wrong."
                        % repr(options.molFormat)
                    )
            if options.molFormat == "smiles":
                # a literal backslash-t from the command line means a tab
                if options.delimiter == "\\t":
                    options.delimiter = "\t"
                supplier = Chem.SmilesMolSupplier(
                    dataFilename,
                    titleLine=options.titleLine,
                    delimiter=options.delimiter,
                    smilesColumn=options.smilesColumn,
                    nameColumn=options.nameColumn,
                )
            else:
                supplier = Chem.SDMolSupplier(dataFilename)
        if not options.silent:
            logger.info("Reading molecules and constructing molecular database.")
        Loader.LoadDb(
            supplier,
            os.path.join(options.outDir, options.molDbName),
            errorsTo=errFile,
            regName=options.regName,
            nameCol=options.molIdName,
            skipProps=options.skipProps,
            defaultVal=options.missingPropertyVal,
            addComputedProps=options.addProps,
            uniqNames=True,
            skipSmiles=options.skipSmiles,
            maxRowsCached=int(options.maxRowsCached),
            silent=options.silent,
            nameProp=options.nameProp,
            lazySupplier=int(options.maxRowsCached) > 0,
        )

    if options.doPairs:
        pairConn = DbConnect(os.path.join(options.outDir, options.pairDbName))
        pairCurs = pairConn.GetCursor()
        dropTable(pairCurs, options.pairTableName)
        pairCurs.execute(
            "create table %s (guid integer not null primary key,%s varchar not null unique,atompairfp blob,torsionfp blob)"
            % (options.pairTableName, options.molIdName)
        )

    if options.doFingerprints or options.doPharm2D or options.doGobbi2D or options.doLayered:
        fpConn = DbConnect(os.path.join(options.outDir, options.fpDbName))
        fpCurs = fpConn.GetCursor()
        dropTable(fpCurs, options.fpTableName)
        dropTable(fpCurs, options.pharm2DTableName)
        dropTable(fpCurs, options.gobbi2DTableName)
        dropTable(fpCurs, options.layeredTableName)

        if options.doFingerprints:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,rdkfp blob)"
                % (options.fpTableName, options.molIdName)
            )
        if options.doLayered:
            # one "?" placeholder and one integer column per layered-fp word
            layeredQs = ",".join("?" * LayeredOptions.nWords)
            colDefs = ",".join(["Col_%d integer" % (x + 1) for x in range(LayeredOptions.nWords)])
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,%s)"
                % (options.layeredTableName, options.molIdName, colDefs)
            )
        if options.doPharm2D:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,pharm2dfp blob)"
                % (options.pharm2DTableName, options.molIdName)
            )
            sigFactory = BuildSigFactory(options)
        if options.doGobbi2D:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,gobbi2dfp blob)"
                % (options.gobbi2DTableName, options.molIdName)
            )
            from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D

    if options.doMorganFps:
        fpConn = DbConnect(os.path.join(options.outDir, options.fpDbName))
        fpCurs = fpConn.GetCursor()
        dropTable(fpCurs, options.morganFpTableName)
        fpCurs.execute(
            "create table %s (guid integer not null primary key,%s varchar not null unique,morganfp blob)"
            % (options.morganFpTableName, options.molIdName)
        )

    if options.doDescriptors:
        descrConn = DbConnect(os.path.join(options.outDir, options.descrDbName))
        # NOTE(review): unpickling executes arbitrary code; only point
        # descriptorCalcFilename at files you trust.
        with open(options.descriptorCalcFilename, "rb") as inF:
            calc = cPickle.load(inF)
        nms = list(calc.GetDescriptorNames())
        descrCurs = descrConn.GetCursor()
        descrs = ["guid integer not null primary key", "%s varchar not null unique" % options.molIdName]
        descrs.extend(["%s float" % x for x in nms])
        dropTable(descrCurs, options.descrTableName)
        descrCurs.execute("create table %s (%s)" % (options.descrTableName, ",".join(descrs)))
        descrQuery = ",".join([DbModule.placeHolder] * len(descrs))

    # Pending rows per output table; flushRows() empties them in place.
    pairRows = []
    fpRows = []
    layeredRows = []
    descrRows = []
    pharm2DRows = []
    gobbi2DRows = []
    morganRows = []

    def flushRows():
        # Insert and commit whatever is buffered.  A buffer can only be
        # non-empty when its option was enabled, so the matching cursor and
        # connection are guaranteed to exist when they are touched.
        if pairRows:
            pairCurs.executemany("insert into %s values (?,?,?,?)" % options.pairTableName, pairRows)
            del pairRows[:]
            pairConn.Commit()
        if fpRows:
            fpCurs.executemany("insert into %s values (?,?,?)" % options.fpTableName, fpRows)
            del fpRows[:]
            fpConn.Commit()
        if layeredRows:
            fpCurs.executemany(
                "insert into %s values (?,?,%s)" % (options.layeredTableName, layeredQs), layeredRows
            )
            del layeredRows[:]
            fpConn.Commit()
        if descrRows:
            descrCurs.executemany(
                "insert into %s values (%s)" % (options.descrTableName, descrQuery), descrRows
            )
            del descrRows[:]
            descrConn.Commit()
        if pharm2DRows:
            fpCurs.executemany("insert into %s values (?,?,?)" % options.pharm2DTableName, pharm2DRows)
            del pharm2DRows[:]
            fpConn.Commit()
        if gobbi2DRows:
            fpCurs.executemany("insert into %s values (?,?,?)" % options.gobbi2DTableName, gobbi2DRows)
            del gobbi2DRows[:]
            fpConn.Commit()
        if morganRows:
            fpCurs.executemany("insert into %s values (?,?,?)" % options.morganFpTableName, morganRows)
            del morganRows[:]
            fpConn.Commit()

    if not options.silent:
        logger.info("Generating fingerprints and descriptors:")
    molConn = DbConnect(os.path.join(options.outDir, options.molDbName))
    molCurs = molConn.GetCursor()
    if not options.skipSmiles:
        molCurs.execute("select guid,%s,smiles,molpkl from %s" % (options.molIdName, options.regName))
    else:
        molCurs.execute("select guid,%s,molpkl from %s" % (options.molIdName, options.regName))

    i = 0
    while True:
        tpl = molCurs.fetchone()
        if tpl is None:
            # cursor exhausted
            break
        molGuid = tpl[0]
        molId = tpl[1]
        pkl = tpl[-1]  # the pickled molecule is always the last column
        i += 1
        mol = Chem.Mol(str(pkl))
        if not mol:
            continue

        if options.doPairs:
            pairs = FingerprintUtils.BuildAtomPairFP(mol)
            torsions = FingerprintUtils.BuildTorsionsFP(mol)
            pkl1 = DbModule.binaryHolder(pairs.ToBinary())
            pkl2 = DbModule.binaryHolder(torsions.ToBinary())
            row = (molGuid, molId, pkl1, pkl2)
            pairRows.append(row)
        if options.doFingerprints:
            fp2 = FingerprintUtils.BuildRDKitFP(mol)
            pkl = DbModule.binaryHolder(fp2.ToBinary())
            row = (molGuid, molId, pkl)
            fpRows.append(row)
        if options.doLayered:
            words = LayeredOptions.GetWords(mol)
            row = [molGuid, molId] + words
            layeredRows.append(row)
        if options.doDescriptors:
            descrs = calc.CalcDescriptors(mol)
            row = [molGuid, molId]
            row.extend(descrs)
            descrRows.append(row)
        if options.doPharm2D:
            # BuildPharm2DFP reads the factory from the module attribute
            FingerprintUtils.sigFactory = sigFactory
            fp = FingerprintUtils.BuildPharm2DFP(mol)
            pkl = DbModule.binaryHolder(fp.ToBinary())
            row = (molGuid, molId, pkl)
            pharm2DRows.append(row)
        if options.doGobbi2D:
            FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
            fp = FingerprintUtils.BuildPharm2DFP(mol)
            pkl = DbModule.binaryHolder(fp.ToBinary())
            row = (molGuid, molId, pkl)
            gobbi2DRows.append(row)
        if options.doMorganFps:
            morgan = FingerprintUtils.BuildMorganFP(mol)
            pkl = DbModule.binaryHolder(morgan.ToBinary())
            row = (molGuid, molId, pkl)
            morganRows.append(row)

        if not i % 500:
            flushRows()
            if not options.silent:
                logger.info(" Done: %d" % (i))

    # write whatever is left over from the last partial batch
    flushRows()

    if errFile is not None:
        errFile.close()

    if not options.silent:
        logger.info("Finished.")
import gzip
import sys
import time

try:
    import cPickle
except ImportError:  # Python 3: cPickle no longer exists as a separate module
    import pickle as cPickle

from rdkit import Chem
from rdkit import RDConfig
from rdkit.RDLogger import logger

logger = logger()

# Read the SMARTS library: qs holds the parsed query molecules, smas the
# SMARTS strings they came from (same order).
logger.info('reading smarts')
qs = []
smas = []
with open(RDConfig.RDDataDir + '/SmartsLib/RLewis_smarts.txt', 'r') as inF:
    for line in inF:
        if line[0] == '#':
            continue
        line = line.split(' ')
        p = Chem.MolFromSmarts(line[0])
        if not p:
            # report the SMARTS that failed to parse and skip it
            sys.stderr.write(line[0] + '\n')
            continue
        smas.append(line[0])
        qs.append(p)

# NOTE(review): both data files are unpickled; only run this against
# trusted local copies.
logger.info('reading target counts')
with gzip.open('fps.1000.counts.pkl.gz', 'rb') as inF:
    refFps = cPickle.loads(inF.read())
fps = []
logger.info('reading mols:')
with gzip.open('mols.1000.pkl.gz', 'rb') as inF:
    ms = cPickle.loads(inF.read())

t1 = time.time()
nFail = 0
for i, m in enumerate(ms):
    fp = [0] * len(qs)
from rdkit import Chem from rdkit import RDConfig import time,cPickle,sys,gzip from rdkit.RDLogger import logger logger = logger() logger.info('reading smarts') qs = [] smas = [] for line in file(RDConfig.RDDataDir+'/SmartsLib/RLewis_smarts.txt','r').readlines(): if line[0] == '#': continue line = line.split(' ') p = Chem.MolFromSmarts(line[0]) if not p: print >>sys.stderr,line[0] continue smas.append(line[0]) qs.append(p) logger.info('reading target counts') refFps = cPickle.loads(gzip.open('fps.1000.counts.pkl.gz','rb').read()) fps = [] logger.info('reading mols:') ms = cPickle.loads(gzip.open('mols.1000.pkl.gz','rb').read()) t1 = time.time() nFail=0 for i,m in enumerate(ms): fp = [0]*len(qs) for j,q in enumerate(qs):
# propField is the name of the property (from the SD file) you want to use # as the "activity" propField = 'chemical_shift_1' # similarity threshold for a pair to be considered interesting. # (i.e. pairs with a similarity below this value will not be # added to the output. similarityThreshold = 0.5 if __name__ == '__main__': suppl = Chem.SDMolSupplier(sys.argv[1]) outF = file(sys.argv[2], 'w+') data = [] logger.info('reading molecules and generating fingeprints') for i, mol in enumerate(suppl): if not mol: continue smi = Chem.MolToSmiles(mol, True) nm = mol.GetProp(nameField) property = float(mol.GetProp(propField)) fp = GetMolFingerprint(mol, maxPathLength) data.append((nm, smi, property, fp)) logger.info(' got %d molecules' % len(data)) logger.info('calculating pairs') pairs = [] for i in range(len(data)): for j in range(i + 1, len(data)):
from chemgrams import (get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils,
                       LanguageModelMCTSWithPUCTTerminating)
from chemgrams.qedscorer import QEDScorer
from rdkit.RDLogger import logger
from rdkit import rdBase

# Turn off the RDKit error and warning log channels.
for _channel in ('rdApp.error', 'rdApp.warning'):
    rdBase.DisableLog(_channel)

logger = logger()

THIS_DIR = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    logger.info("loading language model...")
    # The .arpa file supplies the vocabulary for the .klm model.
    vocab = get_arpa_vocab(
        '../resources/chemts_250k_deepsmiles_klm_6gram_190414.arpa')
    lm = KenLMDeepSMILESLanguageModel(
        '../resources/chemts_250k_deepsmiles_klm_6gram_190414.klm', vocab)

    # Search settings
    num_simulations = 100000
    width = 24
    max_depth = 100
    start_state = ["<s>"]
    c = 5

    qedscorer = QEDScorer()
    all_smiles = {}
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger

logger = logger()

# tests[n] == 1 means "run test n".  With no command-line arguments every
# test runs; otherwise only test 1 plus the tests named on the command line.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1

ts = []  # elapsed seconds per test
mols = []

# Read the input up front and close the file before the timer starts.
with gzip.open('../Data/znp.50k.smi.gz', 'rt') as inF:
    lines = inF.readlines()

logger.info('mols from smiles')
nMols = 0
nBad = 0
t1 = time.time()
for line in lines:
    # the SMILES is the first space-separated field
    line = line.strip().split(' ')
    m = Chem.MolFromSmiles(line[0])
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)
def RunSearch(options, queryFilename):
    """Search the molecule database and write the hits.

    Up to three kinds of query are combined:

    * a substructure query (``options.smilesQuery`` / ``options.smartsQuery``,
      stored on ``options.queryMol``), optionally pre-screened with the
      layered-fingerprint table;
    * a property query (``options.propQuery``, a SQL ``where`` fragment);
    * a similarity search for every molecule in ``queryFilename``.

    Results go to ``options.outF`` (text), ``options.sdfOut`` (SD file) and
    ``options.smilesOut`` (SMILES); ``'-'`` selects stdout for each.

    Args:
        options: settings object, read via attribute access;
            ``options.queryMol`` and the ``options.fpColName`` default are
            written here.
        queryFilename: file of probe molecules for the similarity search, or
            a false value to skip the similarity step.

    The process exits via ``sys.exit`` when the query molecule cannot be
    built or the query file cannot be opened.
    """
    global sigFactory
    if options.similarityType == 'AtomPairs':
        fpBuilder = FingerprintUtils.BuildAtomPairFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.pairDbName)
        fpTableName = options.pairTableName
        fpColName = options.pairColName
    elif options.similarityType == 'TopologicalTorsions':
        fpBuilder = FingerprintUtils.BuildTorsionsFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.torsionsDbName)
        fpTableName = options.torsionsTableName
        fpColName = options.torsionsColName
    elif options.similarityType == 'RDK':
        fpBuilder = FingerprintUtils.BuildRDKitFP
        simMetric = DataStructs.FingerprintSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.fpTableName
        if not options.fpColName:
            options.fpColName = 'rdkfp'
        fpColName = options.fpColName
    elif options.similarityType == 'Pharm2D':
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.pharm2DTableName
        if not options.fpColName:
            options.fpColName = 'pharm2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = BuildSigFactory(options)
    elif options.similarityType == 'Gobbi2D':
        from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.TanimotoSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.gobbi2DTableName
        if not options.fpColName:
            options.fpColName = 'gobbi2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
    elif options.similarityType == 'Morgan':
        fpBuilder = FingerprintUtils.BuildMorganFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.morganFpDbName)
        fpTableName = options.morganFpTableName
        fpColName = options.morganFpColName

    # An explicit metric overrides the default chosen for the fingerprint type.
    extraArgs = {}
    if options.similarityMetric == 'tanimoto':
        simMetric = DataStructs.TanimotoSimilarity
    elif options.similarityMetric == 'dice':
        simMetric = DataStructs.DiceSimilarity
    elif options.similarityMetric == 'tversky':
        simMetric = DataStructs.TverskySimilarity
        extraArgs['tverskyA'] = options.tverskyA
        extraArgs['tverskyB'] = options.tverskyB

    if options.smilesQuery:
        mol = Chem.MolFromSmiles(options.smilesQuery)
        if not mol:
            logger.error('could not build query molecule from smiles "%s"' % options.smilesQuery)
            sys.exit(-1)
        options.queryMol = mol
    elif options.smartsQuery:
        mol = Chem.MolFromSmarts(options.smartsQuery)
        if not mol:
            logger.error('could not build query molecule from smarts "%s"' % options.smartsQuery)
            sys.exit(-1)
        options.queryMol = mol

    if options.outF == '-':
        outF = sys.stdout
    elif options.outF == '':
        outF = None
    else:
        outF = open(options.outF, 'w+')

    molsOut = False
    if options.sdfOut:
        molsOut = True
        if options.sdfOut == '-':
            sdfOut = sys.stdout
        else:
            sdfOut = open(options.sdfOut, 'w+')
    else:
        sdfOut = None
    if options.smilesOut:
        molsOut = True
        if options.smilesOut == '-':
            smilesOut = sys.stdout
        else:
            smilesOut = open(options.smilesOut, 'w+')
    else:
        smilesOut = None

    if queryFilename:
        # Only checks that the file is readable; the reader below reopens it.
        try:
            open(queryFilename, 'r').close()
        except IOError:
            logger.error('could not open query file %s' % queryFilename)
            sys.exit(1)

        if options.molFormat == 'smiles':
            func = GetMolsFromSmilesFile
        elif options.molFormat == 'sdf':
            func = GetMolsFromSDFile

        if not options.silent:
            msg = 'Reading query molecules'
            if fpBuilder:
                msg += ' and generating fingerprints'
            logger.info(msg)
        probes = []
        i = 0
        nms = []
        for nm, smi, mol in func(queryFilename, None, options.nameProp):
            i += 1
            nms.append(nm)
            if not mol:
                logger.error('query molecule %d could not be built' % (i))
                # keep a placeholder so probes stays aligned with nms
                probes.append((None, None))
                continue
            if fpBuilder:
                probes.append((mol, fpBuilder(mol)))
            else:
                probes.append((mol, None))
            if not options.silent and not i % 1000:
                logger.info(" done %d" % i)
    else:
        probes = None

    # NOTE: many SQL strings below are filled in with "% locals()", so the
    # local names idCol, idTyp, where, join, propQuery, molDbName, molIdName,
    # fpColName, fpTableName and cnText must not be renamed.
    conn = None
    ids = None
    molDbName = os.path.join(options.dbDir, options.molDbName)
    molIdName = options.molIdName
    mConn = DbConnect(molDbName)
    cns = [(x.lower(), y) for x, y in mConn.GetColumnNamesAndTypes('molecules')]
    idCol, idTyp = cns[0]
    if options.propQuery or options.queryMol:
        conn = DbConnect(molDbName)
        curs = conn.GetCursor()
        if options.queryMol:
            if not options.silent:
                logger.info('Doing substructure query')
            if options.propQuery:
                where = 'where %s' % options.propQuery
            else:
                where = ''
            if not options.silent:
                curs.execute('select count(*) from molecules %(where)s' % locals())
                nToDo = curs.fetchone()[0]

            join = ''
            doSubstructFPs = False
            fpDbName = os.path.join(options.dbDir, options.fpDbName)
            if os.path.exists(fpDbName) and not options.negateQuery:
                curs.execute("attach database '%s' as fpdb" % (fpDbName))
                try:
                    curs.execute('select * from fpdb.%s limit 1' % options.layeredTableName)
                except Exception:
                    # no usable layered table: fall back to a full scan
                    pass
                else:
                    doSubstructFPs = True
                    join = 'join fpdb.%s using (%s)' % (options.layeredTableName, idCol)
                    query = LayeredOptions.GetQueryText(options.queryMol)
                    if query:
                        if not where:
                            where = 'where'
                        else:
                            where += ' and'
                        where += ' ' + query

            cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s' % locals()
            curs.execute(cmd)
            row = curs.fetchone()
            nDone = 0
            ids = []
            while row:
                id, molpkl = row
                if not options.zipMols:
                    m = _molFromPkl(molpkl)
                else:
                    m = Chem.Mol(zlib.decompress(molpkl))
                matched = m.HasSubstructMatch(options.queryMol)
                if options.negateQuery:
                    matched = not matched
                if matched:
                    ids.append(id)
                nDone += 1
                if not options.silent and not nDone % 500:
                    if not doSubstructFPs:
                        logger.info(' searched %d (of %d) molecules; %d hits so far' %
                                    (nDone, nToDo, len(ids)))
                    else:
                        logger.info(' searched through %d molecules; %d hits so far' %
                                    (nDone, len(ids)))
                row = curs.fetchone()
            if not options.silent and doSubstructFPs and nToDo:
                nFiltered = nToDo - nDone
                logger.info(' Fingerprint screenout rate: %d of %d (%%%.2f)' %
                            (nFiltered, nToDo, 100. * nFiltered / nToDo))

        elif options.propQuery:
            if not options.silent:
                logger.info('Doing property query')
            # only the text before the first ';' is used
            propQuery = options.propQuery.split(';')[0]
            curs.execute('select %(idCol)s from molecules where %(propQuery)s' % locals())
            ids = [x[0] for x in curs.fetchall()]
        if not options.silent:
            logger.info('Found %d molecules matching the query' % (len(ids)))

    t1 = time.time()
    if probes:
        if not options.silent:
            logger.info('Finding Neighbors')
        conn = DbConnect(dbName)
        cns = conn.GetColumnNames(fpTableName)
        curs = conn.GetCursor()

        if ids:
            # restrict the neighbor pool to the hits of the earlier query
            ids = [(x, ) for x in ids]
            curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals())
            curs.executemany('insert into _tmpTbl values (?)', ids)
            join = 'join _tmpTbl using (%(idCol)s)' % locals()
        else:
            join = ''

        if cns[0].lower() != idCol.lower():
            # backwards compatibility to the days when mol tables had a guid and
            # the fps tables did not:
            curs.execute("attach database '%(molDbName)s' as mols" % locals())
            curs.execute(""" select %(idCol)s,%(fpColName)s from %(fpTableName)s join (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s) using (%(molIdName)s) """ % (locals()))
        else:
            curs.execute('select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s' % locals())

        def poolFromCurs(curs, similarityMethod):
            # lazily yield (id, fingerprint) pairs from the open cursor
            row = curs.fetchone()
            while row:
                id, pkl = row
                fp = DepickleFP(pkl, similarityMethod)
                yield (id, fp)
                row = curs.fetchone()

        topNLists = GetNeighborLists(probes, options.topN,
                                     poolFromCurs(curs, options.similarityType),
                                     simMetric=simMetric, simThresh=options.simThresh,
                                     **extraArgs)
        uniqIds = set()
        nbrLists = {}
        for i, nm in enumerate(nms):
            topNLists[i].reverse()
            scores = topNLists[i].GetPts()
            nbrNames = topNLists[i].GetExtras()
            nbrs = []
            for j, nbrGuid in enumerate(nbrNames):
                if nbrGuid is None:
                    # the first None ends this probe's neighbor list
                    break
                else:
                    uniqIds.add(nbrGuid)
                    nbrs.append((nbrGuid, scores[j]))
            nbrLists[(i, nm)] = nbrs
        t2 = time.time()
        if not options.silent:
            logger.info('The search took %.1f seconds' % (t2 - t1))

        if not options.silent:
            logger.info('Creating output')

        # map neighbor guids back to molecule names
        curs = mConn.GetCursor()
        ids = list(uniqIds)
        ids = [(x, ) for x in ids]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)' % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)

        ks = list(nbrLists.keys())
        ks.sort()
        if not options.transpose:
            # one line per probe: name, then (neighbor, score) pairs
            for i, nm in ks:
                nbrs = nbrLists[(i, nm)]
                nbrTxt = options.outputDelim.join([nm] + [
                    '%s%s%.3f' % (nmDict[id], options.outputDelim, score) for id, score in nbrs
                ])
                if outF:
                    print(nbrTxt, file=outF)
        else:
            # one column pair per probe, one line per neighbor rank
            labels = ['%s%sSimilarity' % (x[1], options.outputDelim) for x in ks]
            if outF:
                print(options.outputDelim.join(labels), file=outF)
            # NOTE(review): indexing by rank raises IndexError when a probe has
            # fewer than topN neighbors; confirm whether padding is wanted.
            for i in range(options.topN):
                outL = []
                for idx, nm in ks:
                    nbr = nbrLists[(idx, nm)][i]
                    outL.append(nmDict[nbr[0]])
                    outL.append('%.3f' % nbr[1])
                if outF:
                    print(options.outputDelim.join(outL), file=outF)
    else:
        if not options.silent:
            logger.info('Creating output')
        curs = mConn.GetCursor()
        ids = [(x, ) for x in set(ids)]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' % locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        molIdName = options.molIdName
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)' % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)
        if outF:
            print('\n'.join(nmDict.values()), file=outF)

    if molsOut and ids:
        molDbName = os.path.join(options.dbDir, options.molDbName)
        cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
        # make sure molpkl is the last selected column
        if cns[-1] != 'molpkl':
            cns.remove('molpkl')
            cns.append('molpkl')

        # the select below joins against the _tmpTbl filled in above
        curs = mConn.GetCursor()
        cnText = ','.join(cns)
        curs.execute('select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)' % locals())

        row = curs.fetchone()
        while row:
            row = list(row)
            m = _molFromPkl(row[-1])
            guid = row[0]
            nm = nmDict[guid]
            if sdfOut:
                m.SetProp('_Name', nm)
                print(Chem.MolToMolBlock(m), file=sdfOut)
                # every column between the first and molpkl becomes an SD field
                for i in range(1, len(cns) - 1):
                    pn = cns[i]
                    pv = str(row[i])
                    print('> <%s>\n%s\n' % (pn, pv), file=sdfOut)
                print('$$$$', file=sdfOut)
            if smilesOut:
                smi = Chem.MolToSmiles(m, options.chiralSmiles)
                print('%s %s' % (smi, str(row[1])), file=smilesOut)
            row = curs.fetchone()

    # Close the files opened here so buffered output reaches disk; stdout is
    # left alone.
    for fh in (outF, sdfOut, smilesOut):
        if fh is not None and fh is not sys.stdout:
            fh.close()

    if not options.silent:
        logger.info('Done!')
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger

logger = logger()

# tests[n] == 1 means "run test n".  With no command-line arguments every
# test runs; otherwise only test 1 plus the tests named on the command line.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1

ts = []  # elapsed seconds per test
mols = []

# Text mode: the lines are split on a str separator below, which fails on
# the bytes that binary mode returns under Python 3.
with gzip.open("../Data/znp.50k.smi.gz", "rt") as inF:
    lines = inF.readlines()

logger.info("mols from smiles")
nMols = 0
nBad = 0
t1 = time.time()
for line in lines:
    # the SMILES is the first space-separated field
    line = line.strip().split(" ")
    m = Chem.MolFromSmiles(line[0])
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info("Results1: %.2f seconds, %d passed, %d failed" % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)
# Command-line options for the script.
parser = OptionParser("distance predict", version='%prog', option_class=MyOption)
parser.add_option('--maxPathLength', '--max', default=8, type=int,
                  help='maximum length path for the fingerprint')
parser.add_option('--similarityThreshold', '--sim', default=[0.9], type='floatlist',
                  help='threshold for similarity')
parser.add_option('--numNeighbors', '--num', '-n', '-k', default=50, type=int,
                  help='number of neighbors to consider')
parser.add_option('--neighborsFile', '--nbrs', default='',
                  help='name of an output file to hold the neighbor lists')
parser.add_option('--scan', default=False, action="store_true")

if __name__ == '__main__':
    options, args = parser.parse_args()
    # The last positional argument names the output file.  open() replaces
    # the Python-2-only file() builtin and behaves the same there.
    outF = open(args[-1], 'w+')

    logger.info('reading training molecules and generating fingerprints')
    suppl = Chem.SDMolSupplier(args[0])
    train = []
    for i, mol in enumerate(suppl):
        if not mol:
            # skip entries the supplier returned as a false value
            continue
        smi = Chem.MolToSmiles(mol, True)
        nm = mol.GetProp(nameField)
        property = float(mol.GetProp(propField))
        fp = GetMolFingerprint(mol, options.maxPathLength)
        train.append((nm, smi, fp, property))
    logger.info(' got %d molecules' % len(train))

    # With more than two positional arguments the second is a test-set SD file.
    if len(args) > 2:
        suppl = Chem.SDMolSupplier(args[1])
        haveTest = True
logger = logger()

# tests[n] == 1 means "run test n": everything when no arguments are given,
# otherwise only the tests named on the command line.
if len(sys.argv) > 1:
    tests = [0] * 1001
    for x in map(int, sys.argv[1:]):
        tests[x] = 1
else:
    tests = [1] * 1001

ts = []
mols = []

if tests[0]:
    lines = gzip.open(data('znp.50k.smi.gz'), 'rt').readlines()
    logger.info('mols from smiles')
    nMols = 0
    nBad = 0
    t1 = time.time()
    for line in lines:
        # the SMILES is the first space-separated field
        line = line.strip().split(' ')
        m = Chem.MolFromSmiles(line[0])
        if not m:
            nBad += 1
            continue
        nMols += 1
        mols.append(m)
    t2 = time.time()
    logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
from rdkit.Chem import Recap
from rdkit.RDLogger import logger

logger = logger()

# tests[n] == 1 means "run test n": everything when no arguments are given,
# otherwise only test 1 plus the tests named on the command line.
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in map(int, sys.argv[1:]):
        tests[x] = 1
else:
    tests = [1] * 1001

ts = []

# Read the whole SD file and hand the supplier the data directly.
sdData = gzip.open("../Data/mols.1000.sdf.gz").read()
logger.info("mols from sdf")
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)

mols = []
nMols = 0
nBad = 0
t1 = time.time()
for m in suppl:
    if not m:
        nBad += 1
        continue
    nMols += 1
    mols.append(m)
t2 = time.time()
logger.info("Results1: %.2f seconds, %d passed, %d failed" % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)