Example #1
0
    def eval_function(text):
        """Score a generated token sequence by its rescaled logP.

        The tokens in ``text`` are joined, decoded and sanitized into a SMILES
        string.  Anything that fails to decode/sanitize (or that RDKit cannot
        parse) scores 0.  Otherwise the score is ``factor * MolLogP(mol)``
        rescaled linearly relative to the ``[logp_min, logp_max]`` interval
        taken from the enclosing scope.

        :param text: iterable of string tokens produced by the generator
        :return: 0 for invalid molecules, otherwise the rescaled logP score
        """
        generated = ''.join(text)
        try:
            # Decode and sanitize exactly once; the sanitized SMILES is reused
            # below instead of repeating both calls.
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        except Exception:
            return 0

        # A length-based reward term is currently disabled:
        # extracted = DeepSMILESLanguageModelUtils.extract(generated)
        # tokenized = DeepSMILESTokenizer(extracted)
        # len_score = len(tokenized.get_tokens()) / (text_length - 1)  # provide more reward for longer text sequences

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES with None rather than raising;
            # treat it like any other invalid molecule.
            return 0

        logp = factor * MolLogP(mol)
        # Linear rescale relative to the [logp_min, logp_max] interval.
        logp_score = (logp - logp_min) / (logp_max - logp_min)

        score = logp_score  # (logp_score * 0.5) + (len_score * 0.5)

        logger.info("%s, %s" % (generated, str(score)))
        return score
Example #2
0
def GetNeighborLists(probes,topN,pool,
                     simMetric=DataStructs.DiceSimilarity,
                     silent=False):
  """Collect, for every probe, the best-scoring neighbors found in a pool.

  Arguments:
    - probes: sequence whose items carry a fingerprint at index 1
      (a fingerprint of None marks a probe that is skipped)
    - topN: size passed to each per-probe TopNContainer
    - pool: iterable of (name, fingerprint) pairs to search
    - simMetric: similarity function; Dice and Tanimoto use the bulk
      DataStructs implementations, anything else is called pairwise
    - silent: when false, progress is logged every 1000 pool rows

  Returns a list of TopNContainer objects, one per probe (in probe order).
  """
  from rdkit.DataStructs.TopNContainer import TopNContainer

  fingerprints = [probe[1] for probe in probes]
  usable = [idx for idx,pfp in enumerate(fingerprints) if pfp is not None]
  usableFps = [fingerprints[idx] for idx in usable]
  nbrLists = [TopNContainer(topN) for _ in fingerprints]

  # the metric does not change while scanning, so pick the bulk scorer once
  if simMetric==DataStructs.DiceSimilarity:
    bulkScorer = DataStructs.BulkDiceSimilarity
  elif simMetric==DataStructs.TanimotoSimilarity:
    bulkScorer = DataStructs.BulkTanimotoSimilarity
  else:
    bulkScorer = None

  for rowCount,(nm,fp) in enumerate(pool,1):
    if not (silent or rowCount%1000):
      logger.info('  searched %d rows'%rowCount)
    if bulkScorer is not None:
      for idx,score in zip(usable,bulkScorer(fp,usableFps)):
        nbrLists[idx].Insert(score,nm)
    else:
      for idx in usable:
        nbrLists[idx].Insert(simMetric(fingerprints[idx],fp),nm)
  return nbrLists
Example #3
0
def dividetask(data, task, silent=True):
    """Apply ``task`` to every element of ``data``, split across MPI ranks.

    ``data`` is broadcast from rank 0.  Each rank processes one contiguous
    chunk of ``len(data) // size`` elements; rank 0 additionally processes the
    ``len(data) % size`` leftover elements at the front of ``data``.

    :param data: indexable sequence of work items
    :param task: callable applied to each item
    :param silent: when false, log a message after every completed item
    :return: on rank 0 the concatenated results gathered from all ranks;
        an empty list on every other rank
    """
    data = mpi.broadcast(mpi.world, data, 0)

    rank = mpi.world.rank
    chunkSize, extraBits = divmod(len(data), mpi.world.size)

    res = []

    def _process(index, count):
        # run one work item and report progress
        res.append(task(data[index]))
        if not silent:
            logger.info('task(%d) done %d' % (rank, count))

    # the root node handles the extra pieces:
    if rank == 0:
        for i in range(extraBits):
            _process(i, i + 1)

    start = extraBits + rank * chunkSize
    for offset in range(chunkSize):
        _process(start + offset, offset + 1)

    # every rank takes part in the gather; only the root uses the result
    gathered = mpi.gather(mpi.world, res, 0)
    allRes = []
    if rank == 0:
        for part in gathered:
            allRes.extend(part)
    return allRes
    def eval_function(text):
        """Score a generated token sequence by aromaticity and perplexity.

        The tokens in ``text`` are joined, decoded and sanitized into a SMILES
        string; anything that fails (or that RDKit cannot parse) scores 0.
        The final score is the equally weighted mean of

        * the number of aromatic atoms divided by 23, and
        * ``p / (1 + p)`` where ``p`` is ``lm.perplexity(text)``.

        :param text: iterable of string tokens produced by the generator
        :return: 0 for invalid molecules, otherwise the combined reward
        """
        generated = ''.join(text)
        try:
            # Decode and sanitize exactly once; the sanitized SMILES is reused
            # below instead of repeating both calls.
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            smiles = DeepSMILESLanguageModelUtils.sanitize(decoded)
        except Exception:
            return 0

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES with None rather than raising;
            # treat it like any other invalid molecule.
            return 0

        num_aromatic_atoms = sum(
            1 for i in range(mol.GetNumAtoms())
            if mol.GetAtomWithIdx(i).GetIsAromatic())
        arom_reward = num_aromatic_atoms / 23

        perplexity = lm.perplexity(text)
        perplexity_reward = perplexity / (1 + perplexity)

        score = (perplexity_reward * 0.5) + (arom_reward * 0.5)

        logger.info("%s, %s" % (generated, str(score)))
        return score
Example #5
0
def dividetask(data,task,silent=True):
    """Run ``task`` over ``data`` in parallel using the MPI world communicator.

    The data is broadcast from rank 0, every rank works through its own
    contiguous chunk, and rank 0 also takes the remainder that does not divide
    evenly.  Results are gathered on rank 0.

    :param data: indexable sequence of work items
    :param task: callable applied to each item
    :param silent: when false, log a message after every completed item
    :return: the combined result list on rank 0, an empty list elsewhere
    """
    data = mpi.broadcast(mpi.world, data, 0)

    myRank = mpi.world.rank
    chunkSize, extraBits = divmod(len(data), mpi.world.size)

    # (index into data, 1-based counter used in the progress message)
    jobs = []
    # the root node handles the extra pieces:
    if myRank == 0:
        jobs.extend((i, i + 1) for i in range(extraBits))
    first = extraBits + myRank * chunkSize
    jobs.extend((first + i, i + 1) for i in range(chunkSize))

    res = []
    for index, count in jobs:
        res.append(task(data[index]))
        if not silent:
            logger.info('task(%d) done %d'%(myRank,count))

    # all ranks must call gather; only the root receives the pieces
    pieces = mpi.gather(mpi.world, res, 0)
    allRes = []
    if myRank == 0:
        for piece in pieces:
            allRes.extend(piece)
    return allRes
Example #6
0
    def eval_function(text):
        """Score a generated token sequence with ``jscorer``.

        Sequences that cannot be decoded and sanitized score -1.0.  For valid
        ones the raw ``jscorer`` value ``x`` is squashed to ``x / (1 + |x|)``.

        :param text: iterable of string tokens produced by the generator
        :return: -1.0 for invalid molecules, otherwise the squashed score
        """
        generated = ''.join(text)
        try:
            smiles = DeepSMILESLanguageModelUtils.sanitize(
                DeepSMILESLanguageModelUtils.decode(generated,
                                                    start='<s>',
                                                    end='</s>'))
        except Exception:
            return -1.0

        raw = jscorer.score(smiles)
        score = raw / (1 + np.abs(raw))

        logger.info("%s, %s" % (generated, str(score)))
        return score
Example #7
0
def GetNeighborLists(probes,
                     topN,
                     pool,
                     simMetric=DataStructs.DiceSimilarity,
                     simThresh=-1.,
                     silent=False,
                     **kwargs):
    """Collect, for every probe, its best-scoring neighbors from a pool.

    Arguments:
      - probes: sequence whose items carry a fingerprint at index 1
        (a fingerprint of None marks a probe that is skipped)
      - topN: container size used when no positive threshold is given
      - pool: iterable of (name, fingerprint) pairs to search
      - simMetric: similarity function; Dice, Tanimoto and Tversky use the
        bulk DataStructs implementations, anything else is called pairwise
      - simThresh: only scores strictly greater than this are stored; when it
        is positive the containers are created with size -1 instead of topN
      - silent: when false, progress is logged every 1000 pool rows
      - kwargs: 'tverskyA' / 'tverskyB' weights for Tversky (default 0.5)

    Returns a list of TopNContainer objects, one per probe (in probe order).
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    validProbes = [i for i, pfp in enumerate(probeFps) if pfp is not None]
    validFps = [probeFps[i] for i in validProbes]

    containerSize = topN if simThresh <= 0 else -1
    nbrLists = [TopNContainer(containerSize) for _ in probeFps]

    # Choose how to score one pool fingerprint against all valid probes.
    # Every variant returns scores aligned with validProbes / validFps.
    if simMetric == DataStructs.DiceSimilarity:

        def scoreAll(fp):
            return DataStructs.BulkDiceSimilarity(fp, validFps)
    elif simMetric == DataStructs.TanimotoSimilarity:

        def scoreAll(fp):
            return DataStructs.BulkTanimotoSimilarity(fp, validFps)
    elif simMetric == DataStructs.TverskySimilarity:
        av = float(kwargs.get('tverskyA', 0.5))
        bv = float(kwargs.get('tverskyB', 0.5))

        def scoreAll(fp):
            return DataStructs.BulkTverskySimilarity(fp, validFps, av, bv)
    else:

        def scoreAll(fp):
            return [simMetric(pfp, fp) for pfp in validFps]

    for nDone, (nm, fp) in enumerate(pool, 1):
        if not silent and not nDone % 1000:
            logger.info('  searched %d rows' % nDone)
        # Pairing scores with validProbes keeps results on the right probe.
        # The old fallback branch used nbrLists[validProbes[i]] with i already
        # being a probe index, which misfiled results (or raised IndexError)
        # whenever some probe fingerprint was None.
        for probeIdx, score in zip(validProbes, scoreAll(fp)):
            if score > simThresh:
                nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
Example #8
0
    def eval_function(text):
        """Score a generated token sequence by its relative token count.

        Sequences that cannot be decoded and sanitized score 0.  Valid ones
        score the number of tokens in the extracted text divided by
        ``text_length - 1``, so longer sequences earn more reward.

        :param text: iterable of string tokens produced by the generator
        :return: 0 for invalid molecules, otherwise the length-based reward
        """
        utils = DeepSMILESLanguageModelUtils
        generated = ''.join(text)
        try:
            # validity check only; the sanitized result itself is not needed
            utils.sanitize(utils.decode(generated, start='<s>', end='</s>'))
        except Exception:
            return 0

        extracted = utils.extract(generated, start='<s>', end='</s>')
        tokens = DeepSMILESTokenizer(extracted).get_tokens()

        # provide more reward for longer text sequences
        score = len(tokens) / (text_length - 1)

        logger.info("%s, %s" % (generated, str(score)))
        return score
Example #9
0
  def testCairoFile(self):
    """Draw self.mol to a PNG file with the cairo canvas and check it is non-empty.

    The test returns early (after logging) when the cairo canvas cannot be
    imported.
    """
    try:
      # imported only to find out whether the cairo backend is available
      from rdkit.Chem.Draw.cairoCanvas import Canvas
    except ImportError:
      logger.info("Skipping cairo test")
      return
    os.environ['RDKIT_CANVAS']='cairo'

    fd,fn=tempfile.mkstemp(suffix='.png')
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not release it, so close it explicitly.
    os.close(fd)
    try:
      self.assertEqual(os.path.getsize(fn),0)

      Draw.MolToFile(self.mol,fn)

      self.assertNotEqual(os.path.getsize(fn),0)
    finally:
      # best-effort cleanup, also when an assertion above failed
      try:
        os.unlink(fn)
      except OSError:
        pass
Example #10
0
    def testSpingFile(self):
        """Draw self.mol to a PNG file with the sping canvas and check it is non-empty.

        The test returns early (after logging) when the sping canvas cannot be
        imported.
        """
        try:
            # imported only to find out whether the sping backend is available
            from rdkit.Chem.Draw.spingCanvas import Canvas
        except ImportError:
            logger.info("Skipping sping test")
            return
        os.environ['RDKIT_CANVAS'] = 'sping'

        fd, fn = tempfile.mkstemp(suffix='.png')
        # mkstemp hands back an open OS-level descriptor; rebinding the name
        # does not release it, so close it explicitly.
        os.close(fd)
        try:
            self.assertEqual(os.path.getsize(fn), 0)

            Draw.MolToFile(self.mol, fn)

            self.assertNotEqual(os.path.getsize(fn), 0)
        finally:
            # best-effort cleanup, also when an assertion above failed
            try:
                os.unlink(fn)
            except OSError:
                pass
Example #11
0
def GetNeighborLists(probes,topN,pool,
                     simMetric=DataStructs.DiceSimilarity,
                     simThresh=-1.,
                     silent=False,
                     **kwargs):
  """Collect, for every probe, its best-scoring neighbors from a pool.

  Arguments:
    - probes: sequence whose items carry a fingerprint at index 1
      (a fingerprint of None marks a probe that is skipped)
    - topN: container size used when no positive threshold is given
    - pool: iterable of (name, fingerprint) pairs to search
    - simMetric: similarity function; Dice, Tanimoto and Tversky use the
      bulk DataStructs implementations, anything else is called pairwise
    - simThresh: only scores strictly greater than this are stored; when it
      is positive the containers are created with size -1 instead of topN
    - silent: when false, progress is logged every 1000 pool rows
    - kwargs: 'tverskyA' / 'tverskyB' weights for Tversky (default 0.5)

  Returns a list of TopNContainer objects, one per probe (in probe order).
  """
  probeFps = [x[1] for x in probes]
  validProbes = [x for x in range(len(probeFps)) if probeFps[x] is not None]
  validFps=[probeFps[x] for x in validProbes]
  from rdkit.DataStructs.TopNContainer import TopNContainer
  if simThresh<=0:
    nbrLists = [TopNContainer(topN) for x in range(len(probeFps))]
  else:
    nbrLists=[TopNContainer(-1) for x in range(len(probeFps))]

  nDone=0
  for nm,fp in pool:
    nDone+=1
    if not silent and not nDone%1000: logger.info('  searched %d rows'%nDone)
    if(simMetric==DataStructs.DiceSimilarity):
      scores = DataStructs.BulkDiceSimilarity(fp,validFps)
    elif(simMetric==DataStructs.TanimotoSimilarity):
      scores = DataStructs.BulkTanimotoSimilarity(fp,validFps)
    elif(simMetric==DataStructs.TverskySimilarity):
      av = float(kwargs.get('tverskyA',0.5))
      bv = float(kwargs.get('tverskyB',0.5))
      scores = DataStructs.BulkTverskySimilarity(fp,validFps,av,bv)
    else:
      # generic metric: score the pool fingerprint against each valid probe
      scores = [simMetric(pfp,fp) for pfp in validFps]
    # scores[i] belongs to probe validProbes[i].  The old generic branch
    # looked up validProbes[i] with i already being a probe index, which put
    # results on the wrong probe (or raised IndexError) whenever some probe
    # fingerprint was None.
    for i,score in enumerate(scores):
      if score>simThresh:
        nbrLists[validProbes[i]].Insert(score,nm)
  return nbrLists
Example #12
0
  def testSpingFile(self):
    """Draw self.mol to a PNG file with the sping canvas and check it is non-empty.

    The test returns early (after logging) when the sping canvas cannot be
    imported.
    """
    try:
      # imported only to find out whether the sping backend is available
      from rdkit.Chem.Draw.spingCanvas import Canvas
    except ImportError:
      logger.info("Skipping sping test")
      return
    os.environ['RDKIT_CANVAS']='sping'

    fd,fn=tempfile.mkstemp(suffix='.png')
    # mkstemp hands back an open OS-level descriptor; rebinding the name does
    # not release it, so close it explicitly.
    os.close(fd)
    try:
      self.assertEqual(os.path.getsize(fn),0)

      Draw.MolToFile(self.mol,fn)

      self.assertNotEqual(os.path.getsize(fn),0)
    finally:
      # best-effort cleanup, also when an assertion above failed
      try:
        os.unlink(fn)
      except OSError:
        pass
Example #13
0
    def testAggFile(self):
        """Draw self.mol to a PNG file with the agg canvas and check it is non-empty.

        The test returns early (after logging) when the agg canvas cannot be
        imported.
        """
        try:
            # imported only to find out whether the agg backend is available
            from rdkit.Chem.Draw.aggCanvas import Canvas
        except ImportError:
            logger.info("Skipping agg test")
            return
        os.environ['RDKIT_CANVAS'] = 'agg'

        fd, fn = tempfile.mkstemp(suffix='.png')
        # mkstemp hands back an open OS-level descriptor; rebinding the name
        # does not release it, so close it explicitly.
        os.close(fd)
        try:
            self.assertEqual(os.path.getsize(fn), 0)

            Draw.MolToFile(self.mol, fn)

            self.assertNotEqual(os.path.getsize(fn), 0)
        finally:
            # best-effort cleanup, also when an assertion above failed
            try:
                os.unlink(fn)
            except OSError:
                pass
Example #14
0
    def testSpingFile(self):
        """Draw self.mol to a PNG file with the sping canvas and check it is non-empty.

        The test returns early (after logging) when the sping canvas cannot be
        imported.
        """
        try:
            # imported only to find out whether the sping backend is available
            from rdkit.Chem.Draw.spingCanvas import Canvas
        except ImportError:
            logger.info("Skipping sping test")
            return
        os.environ["RDKIT_CANVAS"] = "sping"

        fd, fn = tempfile.mkstemp(suffix=".png")
        # mkstemp hands back an open OS-level descriptor; rebinding the name
        # does not release it, so close it explicitly.
        os.close(fd)
        try:
            self.assertEqual(os.path.getsize(fn), 0)

            Draw.MolToFile(self.mol, fn)

            self.assertNotEqual(os.path.getsize(fn), 0)
        finally:
            # best-effort cleanup, also when an assertion above failed
            try:
                os.unlink(fn)
            except OSError:
                pass
Example #15
0
    def eval_function(text):
        """Score a generated token sequence with ``qedscorer``, penalising repeats.

        Sequences that cannot be decoded and sanitized score -1.0, as do
        SMILES already present in the module-level ``all_smiles`` mapping.
        A new SMILES is recorded in ``all_smiles`` with its raw score ``x``
        and is scored ``x / (1 + |x|)``.

        :param text: iterable of string tokens produced by the generator
        :return: -1.0 for invalid or repeated molecules, otherwise the
            squashed score
        """
        global all_smiles

        generated = ''.join(text)
        try:
            smiles = DeepSMILESLanguageModelUtils.sanitize(
                DeepSMILESLanguageModelUtils.decode(generated,
                                                    start='<s>',
                                                    end='</s>'))
        except Exception:
            return -1.0

        if smiles not in all_smiles:
            raw = qedscorer.score(smiles)
            score = raw / (1 + np.abs(raw))
            all_smiles[smiles] = raw
        else:
            # already seen: discourage generating the same molecule again
            score = -1.0

        logger.info("%s, %s" % (smiles, str(score)))
        return score
Example #16
0
# Command-line options (the parser object itself is created earlier in the file).
parser.add_option('--maxPathLength','--max',default=8,type=int,
                  help='maximum length path for the fingerprint')
parser.add_option('--similarityThreshold','--sim',default=[0.9],type='floatlist',
                  help='threshold for similarity')
parser.add_option('--numNeighbors','--num','-n','-k',default=50,type=int,
                  help='number of neighbors to consider')
parser.add_option('--neighborsFile','--nbrs',default='',
                  help='name of an output file to hold the neighbor lists')
parser.add_option('--scan',default=False,action="store_true")


if __name__=='__main__':
    options,args = parser.parse_args()
    # The last positional argument names the output file.  The original used
    # the Python-2-only builtin file(); open() behaves the same and also
    # exists on Python 3.
    outF = open(args[-1],'w+')

    logger.info('reading training molecules and generating fingerprints')
    # The first positional argument is the SD file with the training molecules.
    suppl = Chem.SDMolSupplier(args[0])
    train=[]
    for i,mol in enumerate(suppl):
        # skip entries the supplier yields as falsy (e.g. None)
        if not mol:
            continue
        smi = Chem.MolToSmiles(mol,True)
        nm = mol.GetProp(nameField)
        property = float(mol.GetProp(propField))
        fp = GetMolFingerprint(mol,options.maxPathLength)
        train.append((nm,smi,fp,property))
    logger.info('  got %d molecules'%len(train))

    # More than two positional arguments: the second one is read as a further
    # SD file and haveTest is set.
    if len(args)>2:
        suppl = Chem.SDMolSupplier(args[1])
        haveTest=True
Example #17
0
    "benchmark and test fingerprint screenout and substructure searching")
# --validate: stored as args.validateResults (off by default)
parser.add_argument("--validate",
                    dest='validateResults',
                    default=False,
                    action='store_true',
                    help="validate that the screenout isn't missing anything")
# --short: stored as args.doShort; limits how many molecules are read below
parser.add_argument("--short",
                    dest='doShort',
                    default=False,
                    action='store_true',
                    help="run a small subset of the molecules")
args = parser.parse_args()

# elapsed time (seconds) of each benchmark stage, in execution order
ts = []

logger.info('mols from smiles')
mols = []
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/chembl21_25K.pairs.txt.gz
with gzip.open('../Data/chembl21_25K.pairs.txt.gz', 'rb') as inf:
    for line in inf:
        # the file is opened in binary mode, so decode each line before splitting
        line = line.decode().strip().split()
        # whitespace-separated fields 1 and 3 hold the two SMILES of a pair
        smi1 = line[1]
        smi2 = line[3]
        mols.append(Chem.MolFromSmiles(smi1))
        mols.append(Chem.MolFromSmiles(smi2))
        # with --short, stop once at least 1000 molecules have been collected
        if args.doShort and len(mols) >= 1000:
            break
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds, {len(mols)} mols')
Example #18
0
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger
logger = logger()

# tests[k] is 1 when test number k should run.  Without command-line
# arguments every test is enabled; otherwise only test 1 and the test numbers
# given on the command line are.
tests=[1]*1001
if len(sys.argv)>1:
    tests=[0]*1001
    tests[1]=1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
# elapsed time (seconds) of each benchmark stage, in execution order
ts = []

# read the whole compressed SD file into memory; the context manager makes
# sure the gzip handle is closed again
with gzip.open('../Data/mols.1000.sdf.gz','rb') as inF:
    sdData = inF.read()
logger.info('mols from sdf')
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)
mols = []
nMols=0
nBad=0
t1=time.time()
for m in suppl:
    # the supplier yields a falsy value for records it could not turn into
    # a molecule
    if m:
        nMols+=1
        mols.append(m)
    else:
        nBad += 1
t2=time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed'%(t2-t1,nMols,nBad))
ts.append(t2-t1)
Example #19
0
# propField is the name of the property (from the SD file) you want to use
# as the "activity"
propField='chemical_shift_1'

# similarity threshold for a pair to be considered interesting.
# (i.e. pairs with a similiarity below this value will not be
# added to the output.
similarityThreshold=0.5

if __name__=='__main__':
    suppl = Chem.SDMolSupplier(sys.argv[1])
    outF = file(sys.argv[2],'w+')

    data=[]
    logger.info('reading molecules and generating fingeprints')
    for i,mol in enumerate(suppl):
        if not mol:
            continue
        smi = Chem.MolToSmiles(mol,True)
        nm = mol.GetProp(nameField)
        property = float(mol.GetProp(propField))
        fp = GetMolFingerprint(mol,maxPathLength)
        data.append((nm,smi,property,fp))
        
    logger.info('  got %d molecules'%len(data))

    logger.info('calculating pairs')
    pairs = []
    for i in range(len(data)):
        for j in range(i+1,len(data)):
Example #20
0
def CreateDb(options,dataFilename='',supplier=None):
  """Build the molecule database and the requested fingerprint/descriptor tables.

  The work happens in three stages:

    1. When options.loadMols is set, molecules are loaded into the molecule
       database via Loader.LoadDb.  If no supplier is passed in, one is built
       from dataFilename; the molecule format (and, for SMILES files, the
       delimiter) is guessed from the file when not given in options.
    2. For every enabled output (options.doPairs, doFingerprints, doLayered,
       doPharm2D, doGobbi2D, doMorganFps, doDescriptors) the target table is
       dropped if present and created again.  options.noExtras switches all
       of these off.
    3. Every molecule stored in the molecule database is read back, the
       enabled fingerprints/descriptors are computed, and the rows are
       inserted in batches (flushed every 500 molecules and once at the end).

  Arguments:
    - options: option object (e.g. parsed command-line options) supplying
      the settings read throughout this function
    - dataFilename: path of the input molecule file; only needed when no
      supplier is given
    - supplier: optional ready-made molecule supplier

  Raises ValueError when neither dataFilename nor supplier is provided.
  """
  if not dataFilename and supplier is None:
    raise ValueError('Please provide either a data filename or a supplier')

  if options.errFilename:
    errFile=open(os.path.join(options.outDir,options.errFilename),'w+')
  else:
    errFile=None

  if options.noExtras:
    options.doPairs=False
    options.doDescriptors=False
    options.doFingerprints=False
    options.doPharm2D=False
    options.doGobbi2D=False
    options.doLayered=False
    options.doMorganFps=False

  if options.loadMols:
    if supplier is None:
      if not options.molFormat:
        ext = os.path.splitext(dataFilename)[-1].lower()
        if ext=='.sdf':
          options.molFormat='sdf'
        elif ext in ('.smi','.smiles','.txt','.csv'):
          options.molFormat='smiles'
          if not options.delimiter:
            # guess the delimiter
            import csv
            sniffer = csv.Sniffer()
            # only the first 2000 characters are sampled; the context
            # manager closes the file again right away
            with open(dataFilename,'r') as sniffF:
              dlct=sniffer.sniff(sniffF.read(2000))
            options.delimiter=dlct.delimiter
            if not options.silent:
              logger.info('Guessing that delimiter is %s. Use --delimiter argument if this is wrong.'%repr(options.delimiter))

        if not options.silent:
          logger.info('Guessing that mol format is %s. Use --molFormat argument if this is wrong.'%repr(options.molFormat))
      if options.molFormat=='smiles':
        # a literal backslash-t from the command line means a tab character
        if options.delimiter=='\\t': options.delimiter='\t'
        supplier=Chem.SmilesMolSupplier(dataFilename,
                                        titleLine=options.titleLine,
                                        delimiter=options.delimiter,
                                        smilesColumn=options.smilesColumn,
                                        nameColumn=options.nameColumn
                                        )
      else:
        supplier = Chem.SDMolSupplier(dataFilename)
    if not options.silent: logger.info('Reading molecules and constructing molecular database.')
    Loader.LoadDb(supplier,os.path.join(options.outDir,options.molDbName),
                  errorsTo=errFile,regName=options.regName,nameCol=options.molIdName,
                  skipProps=options.skipProps,defaultVal=options.missingPropertyVal,
                  addComputedProps=options.addProps,uniqNames=True,
                  skipSmiles=options.skipSmiles,maxRowsCached=int(options.maxRowsCached),
                  silent=options.silent,nameProp=options.nameProp,
                  lazySupplier=int(options.maxRowsCached)>0,
                  startAnew=not options.updateDb
                  )

  if options.doPairs:
    pairConn = DbConnect(os.path.join(options.outDir,options.pairDbName))
    pairCurs = pairConn.GetCursor()
    # dropping is best effort: the table may simply not exist yet
    try:
      pairCurs.execute('drop table %s'%(options.pairTableName))
    except Exception:
      pass
    pairCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,atompairfp blob,torsionfp blob)'%(options.pairTableName,
                                                                                                         options.molIdName))

  if options.doFingerprints or options.doPharm2D or options.doGobbi2D or options.doLayered:
    fpConn = DbConnect(os.path.join(options.outDir,options.fpDbName))
    fpCurs=fpConn.GetCursor()
    # dropping is best effort: the tables may simply not exist yet
    try:
      fpCurs.execute('drop table %s'%(options.fpTableName))
    except Exception:
      pass
    try:
      fpCurs.execute('drop table %s'%(options.pharm2DTableName))
    except Exception:
      pass
    try:
      fpCurs.execute('drop table %s'%(options.gobbi2DTableName))
    except Exception:
      pass
    try:
      fpCurs.execute('drop table %s'%(options.layeredTableName))
    except Exception:
      pass

    if options.doFingerprints:
      fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,rdkfp blob)'%(options.fpTableName,
                                                                                     options.molIdName))
    if options.doLayered:
      # one '?' placeholder and one integer column per layered-fingerprint word
      layeredQs = ','.join('?'*LayeredOptions.nWords)
      colDefs=','.join(['Col_%d integer'%(x+1) for x in range(LayeredOptions.nWords)])
      fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,%s)'%(options.layeredTableName,
                                                                             options.molIdName,
                                                                             colDefs))

    if options.doPharm2D:
      fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,pharm2dfp blob)'%(options.pharm2DTableName,
                                                                                     options.molIdName))
      sigFactory = BuildSigFactory(options)
    if options.doGobbi2D:
      fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,gobbi2dfp blob)'%(options.gobbi2DTableName,
                                                                                     options.molIdName))
      from rdkit.Chem.Pharm2D import Generate,Gobbi_Pharm2D

  if options.doMorganFps :
    fpConn = DbConnect(os.path.join(options.outDir,options.fpDbName))
    fpCurs=fpConn.GetCursor()
    try:
      fpCurs.execute('drop table %s'%(options.morganFpTableName))
    except Exception:
      pass
    fpCurs.execute('create table %s (guid integer not null primary key,%s varchar not null unique,morganfp blob)'%(options.morganFpTableName,
                                                                                        options.molIdName))

  if options.doDescriptors:
    descrConn=DbConnect(os.path.join(options.outDir,options.descrDbName))
    # the file is read as text and line endings are normalised before
    # unpickling; the with-block closes it
    with open(options.descriptorCalcFilename,'r') as inTF:
      buf = inTF.read().replace('\r\n', '\n').encode('utf-8')
    # NOTE(review): unpickling executes arbitrary code; only point
    # descriptorCalcFilename at files from a trusted source.
    calc = cPickle.load(io.BytesIO(buf))
    nms = [x for x in calc.GetDescriptorNames()]
    descrCurs = descrConn.GetCursor()
    descrs = ['guid integer not null primary key','%s varchar not null unique'%options.molIdName]
    descrs.extend(['%s float'%x for x in nms])
    try:
      descrCurs.execute('drop table %s'%(options.descrTableName))
    except Exception:
      pass
    descrCurs.execute('create table %s (%s)'%(options.descrTableName,','.join(descrs)))
    descrQuery=','.join([DbModule.placeHolder]*len(descrs))
  # pending rows for each table; flushed in batches below
  pairRows = []
  fpRows = []
  layeredRows = []
  descrRows = []
  pharm2DRows=[]
  gobbi2DRows=[]
  morganRows = []

  if not options.silent: logger.info('Generating fingerprints and descriptors:')
  molConn = DbConnect(os.path.join(options.outDir,options.molDbName))
  molCurs = molConn.GetCursor()
  if not options.skipSmiles:
    molCurs.execute('select guid,%s,smiles,molpkl from %s'%(options.molIdName,options.regName))
  else:
    molCurs.execute('select guid,%s,molpkl from %s'%(options.molIdName,options.regName))
  i=0
  while True:
    # the loop ends as soon as a row cannot be fetched/unpacked (e.g. when
    # fetchone() returns None once the result set is exhausted)
    try:
      tpl = molCurs.fetchone()
      molGuid = tpl[0]
      molId = tpl[1]
      pkl = tpl[-1]
      i+=1
    except Exception:
      break
    if isinstance(pkl,(bytes,str)):
      mol = Chem.Mol(pkl)
    else:
      mol = Chem.Mol(str(pkl))
    if not mol: continue

    if options.doPairs:
      pairs = FingerprintUtils.BuildAtomPairFP(mol)
      torsions = FingerprintUtils.BuildTorsionsFP(mol)
      pkl1 = DbModule.binaryHolder(pairs.ToBinary())
      pkl2 = DbModule.binaryHolder(torsions.ToBinary())
      row = (molGuid,molId,pkl1,pkl2)
      pairRows.append(row)
    if options.doFingerprints:
      fp2 = FingerprintUtils.BuildRDKitFP(mol)
      pkl = DbModule.binaryHolder(fp2.ToBinary())
      row = (molGuid,molId,pkl)
      fpRows.append(row)
    if options.doLayered:
      words = LayeredOptions.GetWords(mol)
      row = [molGuid,molId]+words
      layeredRows.append(row)
    if options.doDescriptors:
      descrs= calc.CalcDescriptors(mol)
      row = [molGuid,molId]
      row.extend(descrs)
      descrRows.append(row)
    if options.doPharm2D:
      FingerprintUtils.sigFactory=sigFactory
      fp= FingerprintUtils.BuildPharm2DFP(mol)
      pkl = DbModule.binaryHolder(fp.ToBinary())
      row = (molGuid,molId,pkl)
      pharm2DRows.append(row)
    if options.doGobbi2D:
      FingerprintUtils.sigFactory=Gobbi_Pharm2D.factory
      fp= FingerprintUtils.BuildPharm2DFP(mol)
      pkl = DbModule.binaryHolder(fp.ToBinary())
      row = (molGuid,molId,pkl)
      gobbi2DRows.append(row)
    if options.doMorganFps:
      morgan = FingerprintUtils.BuildMorganFP(mol)
      pkl = DbModule.binaryHolder(morgan.ToBinary())
      row = (molGuid,molId,pkl)
      morganRows.append(row)

    # flush the pending rows every 500 molecules
    if not i%500:
      if len(pairRows):
        pairCurs.executemany('insert into %s values (?,?,?,?)'%options.pairTableName,
                             pairRows)
        pairRows = []
        pairConn.Commit()
      if len(fpRows):
        fpCurs.executemany('insert into %s values (?,?,?)'%options.fpTableName,
                           fpRows)
        fpRows = []
        fpConn.Commit()
      if len(layeredRows):
        fpCurs.executemany('insert into %s values (?,?,%s)'%(options.layeredTableName,layeredQs),
                           layeredRows)
        layeredRows = []
        fpConn.Commit()
      if len(descrRows):
        descrCurs.executemany('insert into %s values (%s)'%(options.descrTableName,descrQuery),
                              descrRows)
        descrRows = []
        descrConn.Commit()
      if len(pharm2DRows):
        fpCurs.executemany('insert into %s values (?,?,?)'%options.pharm2DTableName,
                           pharm2DRows)
        pharm2DRows = []
        fpConn.Commit()
      if len(gobbi2DRows):
        fpCurs.executemany('insert into %s values (?,?,?)'%options.gobbi2DTableName,
                           gobbi2DRows)
        gobbi2DRows = []
        fpConn.Commit()
      if len(morganRows):
        fpCurs.executemany('insert into %s values (?,?,?)'%options.morganFpTableName,
                             morganRows)
        morganRows = []
        fpConn.Commit()

    if not options.silent and not i%500:
      logger.info('  Done: %d'%(i))

  # final flush of whatever is left over from the last partial batch
  if len(pairRows):
    pairCurs.executemany('insert into %s values (?,?,?,?)'%options.pairTableName,
                         pairRows)
    pairRows = []
    pairConn.Commit()
  if len(fpRows):
    fpCurs.executemany('insert into %s values (?,?,?)'%options.fpTableName,
                       fpRows)
    fpRows = []
    fpConn.Commit()
  if len(layeredRows):
    fpCurs.executemany('insert into %s values (?,?,%s)'%(options.layeredTableName,layeredQs),
                       layeredRows)
    layeredRows = []
    fpConn.Commit()
  if len(descrRows):
    descrCurs.executemany('insert into %s values (%s)'%(options.descrTableName,descrQuery),
                          descrRows)
    descrRows = []
    descrConn.Commit()
  if len(pharm2DRows):
    fpCurs.executemany('insert into %s values (?,?,?)'%options.pharm2DTableName,
                       pharm2DRows)
    pharm2DRows = []
    fpConn.Commit()
  if len(gobbi2DRows):
    fpCurs.executemany('insert into %s values (?,?,?)'%options.gobbi2DTableName,
                       gobbi2DRows)
    gobbi2DRows = []
    fpConn.Commit()
  if len(morganRows):
    fpCurs.executemany('insert into %s values (?,?,?)'%options.morganFpTableName,
                       morganRows)
    morganRows = []
    fpConn.Commit()

  # the error log is only handed to Loader.LoadDb above; release it now
  if errFile is not None:
    errFile.close()

  if not options.silent:
    logger.info('Finished.')
Example #21
0
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger
logger = logger()

# tests[k] is 1 when test number k should run.  Without command-line
# arguments every test is enabled; otherwise only test 1 and the test numbers
# given on the command line are.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
# elapsed time (seconds) of each benchmark stage, in execution order
ts = []

# read the whole compressed SD file into memory; the context manager makes
# sure the gzip handle is closed again
with gzip.open('../Data/mols.1000.sdf.gz', 'rb') as inF:
    sdData = inF.read()
logger.info('mols from sdf')
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)
mols = []
nMols = 0
nBad = 0
t1 = time.time()
for m in suppl:
    # the supplier yields a falsy value for records it could not turn into
    # a molecule
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed' %
            (t2 - t1, nMols, nBad))
Example #22
0
# Draw 1000 samples from the language model and keep the valid molecule with
# the lowest logP seen so far.
lm = EmptyDeepSMILESLanguageModel(vocab, n=6)

current_best_score = None
current_best_smiles = None


def beats_current(score):
    """Return True when ``score`` is lower than the best score so far."""
    return score < current_best_score


for i in range(1000):
    generated = lm.generate(num_chars=25, text_seed="<s>")
    try:
        decoded = DeepSMILESLanguageModelUtils.decode(
            generated, start='<s>', end='</s>')
        sanitized = DeepSMILESLanguageModelUtils.sanitize(decoded)

        mol = Chem.MolFromSmiles(sanitized)
        logp_score = MolLogP(mol)

        logger.info("successful: %s , score: %s" % (sanitized, logp_score))

        if current_best_score is None or beats_current(logp_score):
            current_best_score, current_best_smiles = logp_score, sanitized
    except Exception:
        # samples that fail anywhere above are simply skipped
        continue

logger.info("best: %s , score: %s" %
            (current_best_smiles, current_best_score))
Example #23
0
from rdkit.Chem import AllChem
from rdkit.Chem import Recap
from rdkit.RDLogger import logger
logger = logger()

# tests[k] is 1 when test number k should run.  Without command-line
# arguments every test is enabled; otherwise only test 1 and the test numbers
# given on the command line are.
tests=[1]*1001
if len(sys.argv)>1:
    tests=[0]*1001
    tests[1]=1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
# elapsed time (seconds) of each benchmark stage, in execution order
ts = []
mols = []
# read all lines up front; the context manager makes sure the gzip handle is
# closed again
with gzip.open('../Data/znp.50k.smi.gz','rb') as inF:
    lines = inF.readlines()
logger.info('mols from smiles')
nMols=0
nBad=0
t1=time.time()
for line in lines:
    # the file is opened in binary mode, so on Python 3 each line is bytes and
    # must be decoded before it can be split on a str separator
    if not isinstance(line,str):
        line = line.decode()
    # the SMILES is the first space-separated field
    line = line.strip().split(' ')
    m = Chem.MolFromSmiles(line[0])
    if m:
        nMols+=1
        mols.append(m)
    else:
        nBad += 1

t2=time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed'%(t2-t1,nMols,nBad))
ts.append(t2-t1)
Example #24
0
def RunSearch(options,queryFilename):
  """Search the molecule database described by `options` and write the results.

  The steps, in order:
    1. choose the fingerprint builder, default similarity metric, database,
       table and column from options.similarityType; options.similarityMetric
       ('tanimoto', 'dice' or 'tversky') then overrides the metric.
    2. build options.queryMol from options.smilesQuery or options.smartsQuery
       when one of them is set.
    3. open the text (options.outF), SD (options.sdfOut) and SMILES
       (options.smilesOut) outputs; '-' means stdout.
    4. if queryFilename is given, read the probe molecules from it and
       fingerprint them.
    5. if there is a substructure or property query, collect the ids of the
       matching rows of the 'molecules' table.
    6. with probes: find the top-N neighbors of each probe (restricted to the
       ids from step 5, if any) and write them to outF; without probes: write
       the names of the molecules found in step 5.
    7. optionally dump the hit molecules as SD and/or SMILES.

  Arguments:
    - options: the parsed options object; its attributes are read throughout
      and a few are filled in here (fpColName, queryMol, sigFactory on
      FingerprintUtils).
    - queryFilename: name of the SMILES or SD file holding the probe
      molecules; a false value skips the similarity search.

  Returns nothing. Calls sys.exit() if the query molecule cannot be built or
  the query file cannot be opened.

  NOTE: this is Python 2 code (file(), print >>, dict.keys() returning a
  list). Many SQL statements are built with '%(name)s'%locals(), so the names
  of the local variables are part of the SQL templates.
  """
  global sigFactory
  # --- step 1: per-fingerprint-type settings ---
  # NOTE(review): if options.similarityType matches none of the branches
  # below, fpBuilder/simMetric/dbName/fpTableName/fpColName stay unbound and
  # the first use raises NameError.
  if options.similarityType=='AtomPairs':
    fpBuilder=FingerprintUtils.BuildAtomPairFP
    simMetric=DataStructs.DiceSimilarity
    dbName = os.path.join(options.dbDir,options.pairDbName)
    fpTableName = options.pairTableName
    fpColName = options.pairColName
  elif options.similarityType=='TopologicalTorsions':
    fpBuilder=FingerprintUtils.BuildTorsionsFP
    simMetric=DataStructs.DiceSimilarity
    dbName = os.path.join(options.dbDir,options.torsionsDbName)
    fpTableName = options.torsionsTableName
    fpColName = options.torsionsColName
  elif options.similarityType=='RDK':
    fpBuilder=FingerprintUtils.BuildRDKitFP
    simMetric=DataStructs.FingerprintSimilarity
    dbName = os.path.join(options.dbDir,options.fpDbName)
    fpTableName = options.fpTableName
    if not options.fpColName:
      options.fpColName='rdkfp'
    fpColName = options.fpColName
  elif options.similarityType=='Pharm2D':
    fpBuilder=FingerprintUtils.BuildPharm2DFP
    simMetric=DataStructs.DiceSimilarity
    dbName = os.path.join(options.dbDir,options.fpDbName)
    fpTableName = options.pharm2DTableName
    if not options.fpColName:
      options.fpColName='pharm2dfp'
    fpColName = options.fpColName
    FingerprintUtils.sigFactory = BuildSigFactory(options)
  elif options.similarityType=='Gobbi2D':
    from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
    fpBuilder=FingerprintUtils.BuildPharm2DFP
    simMetric=DataStructs.TanimotoSimilarity
    dbName = os.path.join(options.dbDir,options.fpDbName)
    fpTableName = options.gobbi2DTableName
    if not options.fpColName:
      options.fpColName='gobbi2dfp'
    fpColName = options.fpColName
    FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
  elif options.similarityType=='Morgan':
    fpBuilder=FingerprintUtils.BuildMorganFP
    simMetric=DataStructs.DiceSimilarity
    dbName = os.path.join(options.dbDir,options.morganFpDbName)
    fpTableName = options.morganFpTableName
    fpColName = options.morganFpColName


  # An explicit similarityMetric overrides the per-type default chosen above;
  # the Tversky weights are forwarded to GetNeighborLists via extraArgs.
  extraArgs={}
  if options.similarityMetric=='tanimoto':
    simMetric = DataStructs.TanimotoSimilarity
  elif options.similarityMetric=='dice':
    simMetric = DataStructs.DiceSimilarity
  elif options.similarityMetric=='tversky':
    simMetric = DataStructs.TverskySimilarity
    extraArgs['tverskyA']=options.tverskyA
    extraArgs['tverskyB']=options.tverskyB

  # --- step 2: substructure query molecule ---
  # options.queryMol is only assigned here when a SMILES or SMARTS query was
  # given; otherwise the attribute must already exist on `options`, because it
  # is read unconditionally further down.
  if options.smilesQuery:
    mol=Chem.MolFromSmiles(options.smilesQuery)
    if not mol:
      logger.error('could not build query molecule from smiles "%s"'%options.smilesQuery)
      sys.exit(-1)
    options.queryMol = mol
  elif options.smartsQuery:
    mol=Chem.MolFromSmarts(options.smartsQuery)
    if not mol:
      logger.error('could not build query molecule from smarts "%s"'%options.smartsQuery)
      sys.exit(-1)
    options.queryMol = mol

  # --- step 3: output destinations ('-' is stdout, '' disables text output) ---
  # NOTE: the files opened here are never closed explicitly in this function.
  if options.outF=='-':
    outF=sys.stdout
  elif options.outF=='':
    outF=None
  else:
    outF = file(options.outF,'w+')
  
  molsOut=False
  if options.sdfOut:
    molsOut=True
    if options.sdfOut=='-':
      sdfOut=sys.stdout
    else:
      sdfOut = file(options.sdfOut,'w+')
  else:
    sdfOut=None
  if options.smilesOut:
    molsOut=True
    if options.smilesOut=='-':
      smilesOut=sys.stdout
    else:
      smilesOut = file(options.smilesOut,'w+')
  else:
    smilesOut=None

  # --- step 4: read the probe molecules and fingerprint them ---
  if queryFilename:
    # opened only to check that the file is readable; tmpF is not used again
    try:
      tmpF = file(queryFilename,'r')
    except IOError:
      logger.error('could not open query file %s'%queryFilename)
      sys.exit(1)

    # NOTE(review): any other molFormat value leaves `func` unbound.
    if options.molFormat=='smiles':
      func=GetMolsFromSmilesFile
    elif options.molFormat=='sdf':
      func=GetMolsFromSDFile

    if not options.silent:
      msg='Reading query molecules'
      if fpBuilder: msg+=' and generating fingerprints'
      logger.info(msg)
    # probes holds one (mol, fingerprint) pair per query molecule and nms the
    # matching names; unparsable molecules get a (None,None) placeholder so
    # that the two lists stay aligned.
    probes=[]
    i=0
    nms=[]
    for nm,smi,mol in func(queryFilename,None,options.nameProp):
      i+=1
      nms.append(nm)
      if not mol:
        logger.error('query molecule %d could not be built'%(i))
        probes.append((None,None))
        continue
      if fpBuilder:
        probes.append((mol,fpBuilder(mol)))
      else:
        probes.append((mol,None))
      if not options.silent and not i%1000:
        logger.info("  done %d"%i)
  else:
    probes=None

  # --- step 5: substructure / property query against the molecules table ---
  conn=None
  idName = options.molIdName
  ids=None
  names=None
  molDbName = os.path.join(options.dbDir,options.molDbName)
  molIdName = options.molIdName
  mConn = DbConnect(molDbName)
  # the first column of the molecules table is used as the id column
  cns = [(x.lower(),y) for x,y in mConn.GetColumnNamesAndTypes('molecules')]
  idCol,idTyp=cns[0]
  if options.propQuery or options.queryMol:
    conn = DbConnect(molDbName)
    curs = conn.GetCursor()
    if options.queryMol:
      if not options.silent: logger.info('Doing substructure query')
      # a property query, if present, becomes the SQL where clause
      if options.propQuery:
        where='where %s'%options.propQuery
      else:
        where=''
      # nToDo is only computed (and only used) when progress is being logged
      if not options.silent:
        curs.execute('select count(*) from molecules %(where)s'%locals())
        nToDo = curs.fetchone()[0]

      join=''        
      doSubstructFPs=False
      fpDbName = os.path.join(options.dbDir,options.fpDbName)
      # If the fingerprint database exists and contains the layered
      # fingerprint table, join against it and add the layered-fingerprint
      # condition to the where clause so that fewer molecules need the full
      # substructure match. This is skipped for negated queries.
      if os.path.exists(fpDbName) and not options.negateQuery :
        curs.execute("attach database '%s' as fpdb"%(fpDbName))
        try:
          curs.execute('select * from fpdb.%s limit 1'%options.layeredTableName)
        except:
          # the layered table is not there: fall back to the plain scan
          pass
        else:
          doSubstructFPs=True
          join = 'join fpdb.%s using (%s)'%(options.layeredTableName,idCol)
          query = LayeredOptions.GetQueryText(options.queryMol)
          if query:
            if not where:
              where='where'
            else:
              where += ' and'
            where += ' '+query

      cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s'%locals()
      curs.execute(cmd)
      row=curs.fetchone()
      nDone=0
      ids=[]
      # run the real substructure match on every row the SQL let through
      while row:
        id,molpkl = row
        if not options.zipMols:
          m = Chem.Mol(str(molpkl))
        else:
          m = Chem.Mol(zlib.decompress(str(molpkl)))
        matched=m.HasSubstructMatch(options.queryMol)
        if options.negateQuery:
          matched = not matched
        if matched:
          ids.append(id)
        nDone+=1
        if not options.silent and not nDone%500:
          if not doSubstructFPs:
            logger.info('  searched %d (of %d) molecules; %d hits so far'%(nDone,nToDo,len(ids)))
          else:
            logger.info('  searched through %d molecules; %d hits so far'%(nDone,len(ids)))
        row=curs.fetchone()
      if not options.silent and doSubstructFPs and nToDo:
        # rows counted up front but never fetched were removed by the
        # fingerprint condition
        nFiltered = nToDo-nDone
        logger.info('   Fingerprint screenout rate: %d of %d (%%%.2f)'%(nFiltered,nToDo,100.*nFiltered/nToDo))

    elif options.propQuery:
      if not options.silent: logger.info('Doing property query')
      # only the text before the first ';' is used as the where clause
      propQuery=options.propQuery.split(';')[0]
      curs.execute('select %(idCol)s from molecules where %(propQuery)s'%locals())
      ids = [x[0] for x in curs.fetchall()]
    if not options.silent:
      logger.info('Found %d molecules matching the query'%(len(ids)))

  # --- step 6: similarity search (with probes) or plain hit list (without) ---
  t1=time.time()
  if probes:
    if not options.silent: logger.info('Finding Neighbors')
    conn = DbConnect(dbName)
    cns = conn.GetColumnNames(fpTableName)
    curs = conn.GetCursor()

    # restrict the fingerprint pool to the ids found in step 5, if any, by
    # joining against a temporary table holding those ids
    if ids:
      ids = [(x,) for x in ids]
      curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
      curs.executemany('insert into _tmpTbl values (?)',ids)
      join='join  _tmpTbl using (%(idCol)s)'%locals()
    else:
      join=''

    if cns[0].lower() != idCol.lower():
      # backwards compatibility to the days when mol tables had a guid and
      # the fps tables did not:
      curs.execute("attach database '%(molDbName)s' as mols"%locals())
      curs.execute("""
  select %(idCol)s,%(fpColName)s from %(fpTableName)s join
      (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)
    using (%(molIdName)s)
"""%(locals()))
    else:
      curs.execute('select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'%locals())
    def poolFromCurs(curs,similarityMethod):
      """Yield (id, fingerprint) pairs from the cursor, one row at a time."""
      row = curs.fetchone()
      while row:
        id,pkl = row
        fp = DepickleFP(str(pkl),similarityMethod)
        yield (id,fp)
        row = curs.fetchone()
    topNLists = GetNeighborLists(probes,options.topN,poolFromCurs(curs,options.similarityType),
                                 simMetric=simMetric,simThresh=options.simThresh,**extraArgs)
    # Collect, per probe, the list of (neighbor id, score) pairs, and the set
    # of all neighbor ids so that their names can be looked up in one query.
    uniqIds=set()
    nbrLists = {}
    for i,nm in enumerate(nms):
      # presumably puts the best-scoring neighbors first - confirm against
      # the container returned by GetNeighborLists
      topNLists[i].reverse()
      scores=topNLists[i].GetPts()
      nbrNames = topNLists[i].GetExtras()
      nbrs = []
      for j,nbrGuid in enumerate(nbrNames):
        # a None entry ends the list of real neighbors
        if nbrGuid is None:
          break
        else:
          uniqIds.add(nbrGuid)
          nbrs.append((nbrGuid,scores[j]))
      nbrLists[(i,nm)] = nbrs
    t2=time.time()
    if not options.silent: logger.info('The search took %.1f seconds'%(t2-t1))
    
    if not options.silent: logger.info('Creating output')

    
    # map the neighbor ids back to molecule names using the molecule database
    curs = mConn.GetCursor()
    ids = list(uniqIds)

    ids = [(x,) for x in ids]
    curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
    curs.executemany('insert into _tmpTbl values (?)',ids)
    curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'%locals())
    nmDict={}
    for guid,id in curs.fetchall():
      nmDict[guid]=str(id)
    
    # Python 2: keys() returns a list, sorted here by (probe index, name)
    ks = nbrLists.keys()
    ks.sort()
    if not options.transpose:
      # one line per probe: probe name followed by name/score pairs
      for i,nm in ks:
        nbrs= nbrLists[(i,nm)]
        nbrTxt=options.outputDelim.join([nm]+['%s%s%.3f'%(nmDict[id],options.outputDelim,score) for id,score in nbrs])
        if outF: print >>outF,nbrTxt
    else:
      # one column pair per probe, one row per neighbor rank
      # NOTE(review): indexing with i raises IndexError when a probe has
      # fewer than options.topN neighbors.
      labels = ['%s%sSimilarity'%(x[1],options.outputDelim) for x in ks]
      if outF: print >>outF,options.outputDelim.join(labels)
      for i in range(options.topN):
        outL = []
        for idx,nm in ks:
          nbr = nbrLists[(idx,nm)][i]
          outL.append(nmDict[nbr[0]])
          outL.append('%.3f'%nbr[1])
        if outF: print >>outF,options.outputDelim.join(outL)
  else:
    # no probes: just report the names of the molecules matching the query
    # NOTE(review): ids is still None here when neither a property nor a
    # substructure query was run, in which case set(ids) raises TypeError.
    if not options.silent: logger.info('Creating output')
    curs = mConn.GetCursor()
    ids = [(x,) for x in set(ids)]
    curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)'%locals())
    curs.executemany('insert into _tmpTbl values (?)',ids)
    molIdName = options.molIdName
    curs.execute('select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'%locals())
    nmDict={}
    for guid,id in curs.fetchall():
      nmDict[guid]=str(id)
    if outF: print >>outF,'\n'.join(nmDict.values())
  # --- step 7: write the hit molecules themselves ---
  # This reuses the _tmpTbl temporary table created through mConn in
  # whichever branch of step 6 ran.
  if molsOut and ids:
    molDbName = os.path.join(options.dbDir,options.molDbName)
    # make sure the pickled molecule is the last selected column
    cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
    if cns[-1]!='molpkl':
      cns.remove('molpkl')
      cns.append('molpkl')

    curs = mConn.GetCursor()
    #curs.execute('create temporary table _tmpTbl (guid integer)'%locals())
    #curs.executemany('insert into _tmpTbl values (?)',ids)
    cnText=','.join(cns)
    curs.execute('select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)'%locals())

    row=curs.fetchone()
    molD = {}
    while row:
      row = list(row)
      pkl = row[-1]
      m = Chem.Mol(str(pkl))
      guid = row[0]
      nm = nmDict[guid]
      if sdfOut:
        m.SetProp('_Name',nm)
        print >>sdfOut,Chem.MolToMolBlock(m)
        # every column between the first one and molpkl becomes an SD property
        for i in range(1,len(cns)-1):
          pn = cns[i]
          pv = str(row[i])
          print >>sdfOut,'> <%s>\n%s\n'%(pn,pv)
        print >>sdfOut,'$$$$'
      if smilesOut:
        smi=Chem.MolToSmiles(m,options.chiralSmiles)        
      if smilesOut:
        # the second selected column is written next to the SMILES
        print >>smilesOut,'%s %s'%(smi,str(row[1]))
      row=curs.fetchone()
  if not options.silent: logger.info('Done!')
Example #25
0
    TYPE_CHECKER = copy.copy(Option.TYPE_CHECKER)
    TYPE_CHECKER["floatlist"] = check_floatlist


# Command-line options of the "distance predict" script.
parser = OptionParser("distance predict", version="%prog", option_class=MyOption)
parser.add_option(
    "--maxPathLength", "--max",
    default=8, type=int,
    help="maximum length path for the fingerprint",
)
parser.add_option(
    "--similarityThreshold", "--sim",
    default=[0.9], type="floatlist",
    help="threshold for similarity",
)
parser.add_option(
    "--numNeighbors", "--num", "-n", "-k",
    default=50, type=int,
    help="number of neighbors to consider",
)
parser.add_option(
    "--neighborsFile", "--nbrs",
    default="",
    help="name of an output file to hold the neighbor lists",
)
parser.add_option("--scan", default=False, action="store_true")

if __name__ == "__main__":
    options, args = parser.parse_args()
    # the last positional argument names the output file
    outF = file(args[-1], "w+")

    # Training set: one (name, smiles, fingerprint, property) tuple for each
    # molecule of the first SD file that could be read.
    logger.info("reading training molecules and generating fingerprints")
    suppl = Chem.SDMolSupplier(args[0])
    train = []
    for i, mol in enumerate(suppl):
        if mol:
            smi = Chem.MolToSmiles(mol, True)
            nm = mol.GetProp(nameField)
            property = float(mol.GetProp(propField))
            fp = GetMolFingerprint(mol, options.maxPathLength)
            train.append((nm, smi, fp, property))
    logger.info("  got %d molecules" % len(train))

    # with more than two positional arguments the second one is a test set
    if len(args) > 2:
        suppl = Chem.SDMolSupplier(args[1])
        haveTest = True
from rdkit.RDLogger import logger

from chemgrams import get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils, DeepSMILESTokenizer, \
    LanguageModelMCTSWithUCB1

from rdkit import rdBase
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = logger()

if __name__ == '__main__':

    # Load the vocabulary from the .arpa file and the language model from the
    # matching .klm file.
    logger.info("loading language model...")
    vocab = get_arpa_vocab(
        '../resources/chemts_250k_deepsmiles_klm_10gram_200429.arpa')
    lm = KenLMDeepSMILESLanguageModel(
        '../resources/chemts_250k_deepsmiles_klm_10gram_200429.klm', vocab)

    # Search settings; presumably consumed by the MCTS run that follows this
    # excerpt - their use is not visible here.
    num_simulations = 1000
    width = 3
    text_length = 25
    start_state = ["<s>"]

    def eval_function(text):
        # `text` is a sequence of string tokens; join them into one string and
        # check that it can be decoded and sanitized.
        generated = ''.join(text)
        try:
            decoded = DeepSMILESLanguageModelUtils.decode(generated,
                                                          start='<s>',
                                                          end='</s>')
            DeepSMILESLanguageModelUtils.sanitize(decoded)
Example #27
0
def CreateDb(options, dataFilename="", supplier=None):
    """Build the molecule database and the requested fingerprint/descriptor tables.

    The work happens in three stages:
      1. if options.loadMols is set, load the molecules (from `supplier`, or
         from a supplier built for `dataFilename`) into the molecule database
         via Loader.LoadDb.
      2. drop and re-create the tables for every enabled extra (atom pairs /
         torsions, RDKit fingerprints, layered fingerprints, Pharm2D, Gobbi2D,
         Morgan fingerprints, descriptors).
      3. read every molecule back from the molecule database, compute the
         enabled extras and insert them in batches, committing every 500
         molecules and once more at the end.

    Arguments:
      - options: the parsed options object; options.noExtras switches every
        extra off, and molFormat/delimiter are filled in here when they have
        to be guessed.
      - dataFilename: name of the input file (SD, or delimited text with
        SMILES); only read when no supplier is passed in.
      - supplier: an already-constructed molecule supplier to use instead of
        dataFilename.

    Raises ValueError when neither dataFilename nor supplier is provided.

    NOTE: this is Python 2 code (raise X, "msg" syntax, file(), cPickle).
    """
    if not dataFilename and supplier is None:
        raise ValueError, "Please provide either a data filename or a supplier"

    # optional file that Loader.LoadDb receives as its errorsTo argument
    if options.errFilename:
        errFile = file(os.path.join(options.outDir, options.errFilename), "w+")
    else:
        errFile = None

    # noExtras is a master switch: only the molecule table gets built
    if options.noExtras:
        options.doPairs = False
        options.doDescriptors = False
        options.doFingerprints = False
        options.doPharm2D = False
        options.doGobbi2D = False
        options.doLayered = False
        options.doMorganFps = False

    # --- stage 1: load the molecules ---
    if options.loadMols:
        if supplier is None:
            if not options.molFormat:
                # guess the format from the file extension; an unrecognized
                # extension leaves molFormat unset, which ends up in the SD
                # branch below
                ext = os.path.splitext(dataFilename)[-1].lower()
                if ext == ".sdf":
                    options.molFormat = "sdf"
                elif ext in (".smi", ".smiles", ".txt", ".csv"):
                    options.molFormat = "smiles"
                    if not options.delimiter:
                        # guess the delimiter
                        import csv

                        # sniff the first 2000 characters of the file
                        sniffer = csv.Sniffer()
                        dlct = sniffer.sniff(file(dataFilename, "r").read(2000))
                        options.delimiter = dlct.delimiter
                        if not options.silent:
                            logger.info(
                                "Guessing that delimiter is %s. Use --delimiter argument if this is wrong."
                                % repr(options.delimiter)
                            )

                if not options.silent:
                    logger.info(
                        "Guessing that mol format is %s. Use --molFormat argument if this is wrong."
                        % repr(options.molFormat)
                    )
            if options.molFormat == "smiles":
                # turn a literal backslash-t from the command line into a tab
                if options.delimiter == "\\t":
                    options.delimiter = "\t"
                supplier = Chem.SmilesMolSupplier(
                    dataFilename,
                    titleLine=options.titleLine,
                    delimiter=options.delimiter,
                    smilesColumn=options.smilesColumn,
                    nameColumn=options.nameColumn,
                )
            else:
                supplier = Chem.SDMolSupplier(dataFilename)
        if not options.silent:
            logger.info("Reading molecules and constructing molecular database.")
        Loader.LoadDb(
            supplier,
            os.path.join(options.outDir, options.molDbName),
            errorsTo=errFile,
            regName=options.regName,
            nameCol=options.molIdName,
            skipProps=options.skipProps,
            defaultVal=options.missingPropertyVal,
            addComputedProps=options.addProps,
            uniqNames=True,
            skipSmiles=options.skipSmiles,
            maxRowsCached=int(options.maxRowsCached),
            silent=options.silent,
            nameProp=options.nameProp,
            lazySupplier=int(options.maxRowsCached) > 0,
        )
    # --- stage 2: (re)create the tables for the enabled extras ---
    # Each table is dropped first; a failing drop (e.g. the table does not
    # exist yet) is deliberately ignored.
    if options.doPairs:
        pairConn = DbConnect(os.path.join(options.outDir, options.pairDbName))
        pairCurs = pairConn.GetCursor()
        try:
            pairCurs.execute("drop table %s" % (options.pairTableName))
        except:
            pass
        pairCurs.execute(
            "create table %s (guid integer not null primary key,%s varchar not null unique,atompairfp blob,torsionfp blob)"
            % (options.pairTableName, options.molIdName)
        )

    # RDKit, layered, Pharm2D and Gobbi2D fingerprints share one database
    if options.doFingerprints or options.doPharm2D or options.doGobbi2D or options.doLayered:
        fpConn = DbConnect(os.path.join(options.outDir, options.fpDbName))
        fpCurs = fpConn.GetCursor()
        try:
            fpCurs.execute("drop table %s" % (options.fpTableName))
        except:
            pass
        try:
            fpCurs.execute("drop table %s" % (options.pharm2DTableName))
        except:
            pass
        try:
            fpCurs.execute("drop table %s" % (options.gobbi2DTableName))
        except:
            pass
        try:
            fpCurs.execute("drop table %s" % (options.layeredTableName))
        except:
            pass

        if options.doFingerprints:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,rdkfp blob)"
                % (options.fpTableName, options.molIdName)
            )
        if options.doLayered:
            # one integer column and one insert placeholder per fingerprint word
            layeredQs = ",".join("?" * LayeredOptions.nWords)
            colDefs = ",".join(["Col_%d integer" % (x + 1) for x in range(LayeredOptions.nWords)])
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,%s)"
                % (options.layeredTableName, options.molIdName, colDefs)
            )

        if options.doPharm2D:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,pharm2dfp blob)"
                % (options.pharm2DTableName, options.molIdName)
            )
            sigFactory = BuildSigFactory(options)
        if options.doGobbi2D:
            fpCurs.execute(
                "create table %s (guid integer not null primary key,%s varchar not null unique,gobbi2dfp blob)"
                % (options.gobbi2DTableName, options.molIdName)
            )
            # imported here because Gobbi_Pharm2D.factory is needed in stage 3
            from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D

    # The Morgan table lives in the same database file (options.fpDbName);
    # note that this rebinds fpConn/fpCurs to a new connection even when the
    # block above already opened one.
    if options.doMorganFps:
        fpConn = DbConnect(os.path.join(options.outDir, options.fpDbName))
        fpCurs = fpConn.GetCursor()
        try:
            fpCurs.execute("drop table %s" % (options.morganFpTableName))
        except:
            pass
        fpCurs.execute(
            "create table %s (guid integer not null primary key,%s varchar not null unique,morganfp blob)"
            % (options.morganFpTableName, options.molIdName)
        )

    if options.doDescriptors:
        descrConn = DbConnect(os.path.join(options.outDir, options.descrDbName))
        # NOTE: unpickling can execute arbitrary code; the descriptor
        # calculator file must come from a trusted source.
        calc = cPickle.load(file(options.descriptorCalcFilename, "rb"))
        nms = [x for x in calc.GetDescriptorNames()]
        descrCurs = descrConn.GetCursor()
        # column definitions: guid, molecule id, then one float per descriptor
        descrs = ["guid integer not null primary key", "%s varchar not null unique" % options.molIdName]
        descrs.extend(["%s float" % x for x in nms])
        try:
            descrCurs.execute("drop table %s" % (options.descrTableName))
        except:
            pass
        descrCurs.execute("create table %s (%s)" % (options.descrTableName, ",".join(descrs)))
        # one placeholder per column; computed before `descrs` is reused for
        # the per-molecule descriptor values in the loop below
        descrQuery = ",".join([DbModule.placeHolder] * len(descrs))
    # rows waiting to be inserted, one list per table
    pairRows = []
    fpRows = []
    layeredRows = []
    descrRows = []
    pharm2DRows = []
    gobbi2DRows = []
    morganRows = []

    # --- stage 3: read the molecules back and fill the tables ---
    if not options.silent:
        logger.info("Generating fingerprints and descriptors:")
    molConn = DbConnect(os.path.join(options.outDir, options.molDbName))
    molCurs = molConn.GetCursor()
    # either way the guid is the first column, the molecule id the second and
    # the pickled molecule the last one
    if not options.skipSmiles:
        molCurs.execute("select guid,%s,smiles,molpkl from %s" % (options.molIdName, options.regName))
    else:
        molCurs.execute("select guid,%s,molpkl from %s" % (options.molIdName, options.regName))
    i = 0
    while 1:
        # The loop ends through the except clause: once the rows are
        # exhausted fetchone() returns a value that cannot be indexed.
        try:
            tpl = molCurs.fetchone()
            molGuid = tpl[0]
            molId = tpl[1]
            pkl = tpl[-1]
            i += 1
        except:
            break
        mol = Chem.Mol(str(pkl))
        # NOTE: `continue` also skips the batch flush below for this value of i
        if not mol:
            continue

        if options.doPairs:
            pairs = FingerprintUtils.BuildAtomPairFP(mol)
            torsions = FingerprintUtils.BuildTorsionsFP(mol)
            pkl1 = DbModule.binaryHolder(pairs.ToBinary())
            pkl2 = DbModule.binaryHolder(torsions.ToBinary())
            row = (molGuid, molId, pkl1, pkl2)
            pairRows.append(row)
        if options.doFingerprints:
            fp2 = FingerprintUtils.BuildRDKitFP(mol)
            pkl = DbModule.binaryHolder(fp2.ToBinary())
            row = (molGuid, molId, pkl)
            fpRows.append(row)
        if options.doLayered:
            words = LayeredOptions.GetWords(mol)
            row = [molGuid, molId] + words
            layeredRows.append(row)
        if options.doDescriptors:
            descrs = calc.CalcDescriptors(mol)
            row = [molGuid, molId]
            row.extend(descrs)
            descrRows.append(row)
        if options.doPharm2D:
            # BuildPharm2DFP reads the factory from FingerprintUtils, so it is
            # set right before each call (Gobbi2D below uses a different one)
            FingerprintUtils.sigFactory = sigFactory
            fp = FingerprintUtils.BuildPharm2DFP(mol)
            pkl = DbModule.binaryHolder(fp.ToBinary())
            row = (molGuid, molId, pkl)
            pharm2DRows.append(row)
        if options.doGobbi2D:
            FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
            fp = FingerprintUtils.BuildPharm2DFP(mol)
            pkl = DbModule.binaryHolder(fp.ToBinary())
            row = (molGuid, molId, pkl)
            gobbi2DRows.append(row)
        if options.doMorganFps:
            morgan = FingerprintUtils.BuildMorganFP(mol)
            pkl = DbModule.binaryHolder(morgan.ToBinary())
            row = (molGuid, molId, pkl)
            morganRows.append(row)

        # flush the pending rows of every table each time i hits a multiple
        # of 500
        if not i % 500:
            if len(pairRows):
                pairCurs.executemany("insert into %s values (?,?,?,?)" % options.pairTableName, pairRows)
                pairRows = []
                pairConn.Commit()
            if len(fpRows):
                fpCurs.executemany("insert into %s values (?,?,?)" % options.fpTableName, fpRows)
                fpRows = []
                fpConn.Commit()
            if len(layeredRows):
                fpCurs.executemany(
                    "insert into %s values (?,?,%s)" % (options.layeredTableName, layeredQs), layeredRows
                )
                layeredRows = []
                fpConn.Commit()
            if len(descrRows):
                descrCurs.executemany("insert into %s values (%s)" % (options.descrTableName, descrQuery), descrRows)
                descrRows = []
                descrConn.Commit()
            if len(pharm2DRows):
                fpCurs.executemany("insert into %s values (?,?,?)" % options.pharm2DTableName, pharm2DRows)
                pharm2DRows = []
                fpConn.Commit()
            if len(gobbi2DRows):
                fpCurs.executemany("insert into %s values (?,?,?)" % options.gobbi2DTableName, gobbi2DRows)
                gobbi2DRows = []
                fpConn.Commit()
            if len(morganRows):
                fpCurs.executemany("insert into %s values (?,?,?)" % options.morganFpTableName, morganRows)
                morganRows = []
                fpConn.Commit()

        if not options.silent and not i % 500:
            logger.info("  Done: %d" % (i))

    # final flush: whatever is left over from the last partial batch
    if len(pairRows):
        pairCurs.executemany("insert into %s values (?,?,?,?)" % options.pairTableName, pairRows)
        pairRows = []
        pairConn.Commit()
    if len(fpRows):
        fpCurs.executemany("insert into %s values (?,?,?)" % options.fpTableName, fpRows)
        fpRows = []
        fpConn.Commit()
    if len(layeredRows):
        fpCurs.executemany("insert into %s values (?,?,%s)" % (options.layeredTableName, layeredQs), layeredRows)
        layeredRows = []
        fpConn.Commit()
    if len(descrRows):
        descrCurs.executemany("insert into %s values (%s)" % (options.descrTableName, descrQuery), descrRows)
        descrRows = []
        descrConn.Commit()
    if len(pharm2DRows):
        fpCurs.executemany("insert into %s values (?,?,?)" % options.pharm2DTableName, pharm2DRows)
        pharm2DRows = []
        fpConn.Commit()
    if len(gobbi2DRows):
        fpCurs.executemany("insert into %s values (?,?,?)" % options.gobbi2DTableName, gobbi2DRows)
        gobbi2DRows = []
        fpConn.Commit()
    if len(morganRows):
        fpCurs.executemany("insert into %s values (?,?,?)" % options.morganFpTableName, morganRows)
        morganRows = []
        fpConn.Commit()

    if not options.silent:
        logger.info("Finished.")
Example #28
0
from rdkit import Chem
from rdkit import RDConfig
import time, cPickle, sys, gzip
from rdkit.RDLogger import logger
logger = logger()

# Build the list of SMARTS queries (qs) and their source strings (smas) from
# the RLewis SMARTS library shipped in the RDKit data directory.
logger.info('reading smarts')
qs = []
smas = []
# Read inside `with` so the file is closed once its lines are in memory.
with file(RDConfig.RDDataDir + '/SmartsLib/RLewis_smarts.txt',
          'r') as smartsFile:
    smartsLines = smartsFile.readlines()
for line in smartsLines:
    # lines starting with '#' are comments
    if line[0] == '#':
        continue
    # the SMARTS is the first space-separated field
    line = line.split(' ')
    p = Chem.MolFromSmarts(line[0])
    if not p:
        # report SMARTS that could not be parsed and leave them out
        print >> sys.stderr, line[0]
        continue
    smas.append(line[0])
    qs.append(p)

# NOTE: cPickle.loads executes whatever the pickle contains; both .pkl.gz
# files read below must come from a trusted source.
logger.info('reading target counts')
with gzip.open('fps.1000.counts.pkl.gz', 'rb') as countsFile:
    refFps = cPickle.loads(countsFile.read())

fps = []
logger.info('reading mols:')
with gzip.open('mols.1000.pkl.gz', 'rb') as molsFile:
    ms = cPickle.loads(molsFile.read())
t1 = time.time()
nFail = 0
for i, m in enumerate(ms):
    # one slot per SMARTS query
    fp = [0] * len(qs)
Example #29
0
from rdkit import Chem
from rdkit import RDConfig
import time,cPickle,sys,gzip
from rdkit.RDLogger import logger
logger = logger()

# Build the list of SMARTS queries (qs) and their source strings (smas) from
# the RLewis SMARTS library in the RDKit data directory.
logger.info('reading smarts')
qs = []
smas = []
for line in file(RDConfig.RDDataDir+'/SmartsLib/RLewis_smarts.txt','r').readlines():
    # lines starting with '#' are comments
    if line[0] == '#':
        continue
    # the SMARTS is the first space-separated field
    line = line.split(' ')
    p = Chem.MolFromSmarts(line[0])
    if not p:
        # report SMARTS that could not be parsed and leave them out
        print >>sys.stderr,line[0]
        continue
    smas.append(line[0])
    qs.append(p)

# NOTE: cPickle.loads executes whatever the pickle contains; both .pkl.gz
# files read below must come from a trusted source.
logger.info('reading target counts')
refFps = cPickle.loads(gzip.open('fps.1000.counts.pkl.gz','rb').read())

fps = []   
logger.info('reading mols:')
ms = cPickle.loads(gzip.open('mols.1000.pkl.gz','rb').read())
t1 = time.time()
nFail=0
for i,m in enumerate(ms):
    # one slot per SMARTS query
    fp = [0]*len(qs)
    for j,q in enumerate(qs):
Example #30
0
# propField is the name of the property (from the SD file) you want to use
# as the "activity"
propField = 'chemical_shift_1'

# Similarity threshold for a pair to be considered interesting
# (i.e. pairs with a similarity below this value will not be
# added to the output).
similarityThreshold = 0.5

if __name__ == '__main__':
  # usage: first argument is the input SD file, second the output file
  suppl = Chem.SDMolSupplier(sys.argv[1])
  outF = file(sys.argv[2], 'w+')

  # one (name, smiles, property value, fingerprint) tuple per readable molecule
  data = []
  logger.info('reading molecules and generating fingeprints')
  for i, mol in enumerate(suppl):
    # skip records that could not be parsed
    if not mol:
      continue
    smi = Chem.MolToSmiles(mol, True)
    nm = mol.GetProp(nameField)
    property = float(mol.GetProp(propField))
    fp = GetMolFingerprint(mol, maxPathLength)
    data.append((nm, smi, property, fp))

  logger.info('  got %d molecules' % len(data))

  # visit every unordered pair (i, j) with i < j exactly once
  logger.info('calculating pairs')
  pairs = []
  for i in range(len(data)):
    for j in range(i + 1, len(data)):
Example #31
0
from chemgrams import get_arpa_vocab, KenLMDeepSMILESLanguageModel, DeepSMILESLanguageModelUtils, \
    LanguageModelMCTSWithPUCTTerminating
from chemgrams.qedscorer import QEDScorer

from rdkit.RDLogger import logger
from rdkit import rdBase
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.warning')

logger = logger()
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    # Load the vocabulary from the .arpa file and the language model from the
    # matching .klm file.
    logger.info("loading language model...")
    vocab = get_arpa_vocab(
        '../resources/chemts_250k_deepsmiles_klm_6gram_190414.arpa'
    )
    lm = KenLMDeepSMILESLanguageModel(
        '../resources/chemts_250k_deepsmiles_klm_6gram_190414.klm',
        vocab,
    )

    # Settings for the search that follows.
    start_state = ["<s>"]
    num_simulations, width, max_depth = 100000, 24, 100
    c = 5

    qedscorer = QEDScorer()
    all_smiles = {}
Example #32
0
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger
logger = logger()

# Select which tests run: every one by default; when test numbers are given
# on the command line, only those plus test 1.
tests = [1] * 1001
if len(sys.argv) > 1:
  tests = [0] * 1001
  tests[1] = 1
  for x in sys.argv[1:]:
    x = int(x)
    tests[x] = 1
ts = []
mols = []
# Read the whole file inside a `with` block so the gzip handle is closed as
# soon as the lines are in memory instead of whenever it gets collected.
with gzip.open('../Data/znp.50k.smi.gz', 'rt') as inF:
  lines = inF.readlines()
logger.info('mols from smiles')
nMols = 0
nBad = 0
# Test 1: time how long it takes to build molecules from the SMILES column
# (the first space-separated field of each line).
t1 = time.time()
for line in lines:
  line = line.strip().split(' ')
  m = Chem.MolFromSmiles(line[0])
  if m:
    nMols += 1
    mols.append(m)
  else:
    nBad += 1

t2 = time.time()
logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)
Example #33
0
def RunSearch(options, queryFilename):
    """Run a database search and write the results.

    Depending on ``options`` the search combines up to three stages:

    1. an optional substructure query (``options.smilesQuery`` /
       ``options.smartsQuery`` / ``options.queryMol``) and/or property query
       (``options.propQuery``) against the ``molecules`` table, producing a
       list of matching ids;
    2. an optional similarity search: when ``queryFilename`` is given, the
       molecules in that file are used as probes and their nearest neighbors
       are looked up in the fingerprint table selected by
       ``options.similarityType`` (restricted to the ids from stage 1, if
       any);
    3. output of the results as delimited text (``options.outF``) and,
       optionally, as an SD file (``options.sdfOut``) and/or a SMILES file
       (``options.smilesOut``).  The value ``'-'`` selects ``sys.stdout``.

    Arguments:
      - options: parsed command-line options object; it is read extensively
        and ``options.queryMol`` / ``options.fpColName`` may be set on it.
      - queryFilename: name of a file with probe molecules (format given by
        ``options.molFormat``), or a false value to skip the similarity
        search.

    The function returns nothing.  It calls ``sys.exit()`` when a query
    molecule cannot be built, when the query file cannot be opened, or when
    ``options.molFormat`` is not recognized.
    """
    global sigFactory
    # ------------------------------------------------------------------
    # pick the fingerprint builder, default metric and fingerprint table
    # ------------------------------------------------------------------
    if options.similarityType == 'AtomPairs':
        fpBuilder = FingerprintUtils.BuildAtomPairFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.pairDbName)
        fpTableName = options.pairTableName
        fpColName = options.pairColName
    elif options.similarityType == 'TopologicalTorsions':
        fpBuilder = FingerprintUtils.BuildTorsionsFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.torsionsDbName)
        fpTableName = options.torsionsTableName
        fpColName = options.torsionsColName
    elif options.similarityType == 'RDK':
        fpBuilder = FingerprintUtils.BuildRDKitFP
        simMetric = DataStructs.FingerprintSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.fpTableName
        if not options.fpColName:
            options.fpColName = 'rdkfp'
        fpColName = options.fpColName
    elif options.similarityType == 'Pharm2D':
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.pharm2DTableName
        if not options.fpColName:
            options.fpColName = 'pharm2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = BuildSigFactory(options)
    elif options.similarityType == 'Gobbi2D':
        from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.TanimotoSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.gobbi2DTableName
        if not options.fpColName:
            options.fpColName = 'gobbi2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
    elif options.similarityType == 'Morgan':
        fpBuilder = FingerprintUtils.BuildMorganFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.morganFpDbName)
        fpTableName = options.morganFpTableName
        fpColName = options.morganFpColName

    # an explicitly requested metric overrides the per-fingerprint default
    extraArgs = {}
    if options.similarityMetric == 'tanimoto':
        simMetric = DataStructs.TanimotoSimilarity
    elif options.similarityMetric == 'dice':
        simMetric = DataStructs.DiceSimilarity
    elif options.similarityMetric == 'tversky':
        simMetric = DataStructs.TverskySimilarity
        extraArgs['tverskyA'] = options.tverskyA
        extraArgs['tverskyB'] = options.tverskyB

    # ------------------------------------------------------------------
    # build the substructure query molecule, if one was requested
    # ------------------------------------------------------------------
    if options.smilesQuery:
        mol = Chem.MolFromSmiles(options.smilesQuery)
        if not mol:
            logger.error('could not build query molecule from smiles "%s"' %
                         options.smilesQuery)
            sys.exit(-1)
        options.queryMol = mol
    elif options.smartsQuery:
        mol = Chem.MolFromSmarts(options.smartsQuery)
        if not mol:
            logger.error('could not build query molecule from smarts "%s"' %
                         options.smartsQuery)
            sys.exit(-1)
        options.queryMol = mol

    # ------------------------------------------------------------------
    # open the output destinations ('-' means stdout, '' means no output)
    # ------------------------------------------------------------------
    if options.outF == '-':
        outF = sys.stdout
    elif options.outF == '':
        outF = None
    else:
        outF = open(options.outF, 'w+')

    molsOut = False
    if options.sdfOut:
        molsOut = True
        if options.sdfOut == '-':
            sdfOut = sys.stdout
        else:
            sdfOut = open(options.sdfOut, 'w+')
    else:
        sdfOut = None
    if options.smilesOut:
        molsOut = True
        if options.smilesOut == '-':
            smilesOut = sys.stdout
        else:
            smilesOut = open(options.smilesOut, 'w+')
    else:
        smilesOut = None

    # ------------------------------------------------------------------
    # read the probe molecules (and fingerprint them)
    # ------------------------------------------------------------------
    if queryFilename:
        try:
            tmpF = open(queryFilename, 'r')
        except IOError:
            logger.error('could not open query file %s' % queryFilename)
            sys.exit(1)
        else:
            # the file was only opened to check that it is readable
            tmpF.close()

        if options.molFormat == 'smiles':
            func = GetMolsFromSmilesFile
        elif options.molFormat == 'sdf':
            func = GetMolsFromSDFile
        else:
            # fail with a clear message instead of a NameError on ``func``
            logger.error('unrecognized molecule format "%s"' %
                         options.molFormat)
            sys.exit(1)

        if not options.silent:
            msg = 'Reading query molecules'
            if fpBuilder: msg += ' and generating fingerprints'
            logger.info(msg)
        probes = []
        i = 0
        nms = []
        for nm, smi, mol in func(queryFilename, None, options.nameProp):
            i += 1
            nms.append(nm)
            if not mol:
                logger.error('query molecule %d could not be built' % (i))
                # keep a placeholder so that probes and nms stay aligned
                probes.append((None, None))
                continue
            if fpBuilder:
                probes.append((mol, fpBuilder(mol)))
            else:
                probes.append((mol, None))
            if not options.silent and not i % 1000:
                logger.info("  done %d" % i)
    else:
        probes = None

    # ------------------------------------------------------------------
    # substructure and/or property query -> list of matching ids
    # ------------------------------------------------------------------
    conn = None
    idName = options.molIdName
    ids = None
    names = None
    molDbName = os.path.join(options.dbDir, options.molDbName)
    molIdName = options.molIdName
    mConn = DbConnect(molDbName)
    cns = [(x.lower(), y)
           for x, y in mConn.GetColumnNamesAndTypes('molecules')]
    # the first column of the molecules table is used as the id column
    idCol, idTyp = cns[0]
    if options.propQuery or options.queryMol:
        conn = DbConnect(molDbName)
        curs = conn.GetCursor()
        if options.queryMol:
            if not options.silent: logger.info('Doing substructure query')
            if options.propQuery:
                where = 'where %s' % options.propQuery
            else:
                where = ''
            if not options.silent:
                # nToDo is only needed for progress messages
                curs.execute('select count(*) from molecules %(where)s' %
                             locals())
                nToDo = curs.fetchone()[0]

            join = ''
            doSubstructFPs = False
            fpDbName = os.path.join(options.dbDir, options.fpDbName)
            if os.path.exists(fpDbName) and not options.negateQuery:
                curs.execute("attach database '%s' as fpdb" % (fpDbName))
                try:
                    # probe for the layered-fingerprint table
                    curs.execute('select * from fpdb.%s limit 1' %
                                 options.layeredTableName)
                except Exception:
                    # no usable layered fingerprints: fall back to an
                    # unscreened substructure search
                    pass
                else:
                    doSubstructFPs = True
                    join = 'join fpdb.%s using (%s)' % (
                        options.layeredTableName, idCol)
                    query = LayeredOptions.GetQueryText(options.queryMol)
                    if query:
                        if not where:
                            where = 'where'
                        else:
                            where += ' and'
                        where += ' ' + query

            cmd = 'select %(idCol)s,molpkl from molecules %(join)s %(where)s' % locals()
            curs.execute(cmd)
            row = curs.fetchone()
            nDone = 0
            ids = []
            while row:
                id, molpkl = row
                if not options.zipMols:
                    m = _molFromPkl(molpkl)
                else:
                    m = Chem.Mol(zlib.decompress(molpkl))
                matched = m.HasSubstructMatch(options.queryMol)
                if options.negateQuery:
                    matched = not matched
                if matched:
                    ids.append(id)
                nDone += 1
                if not options.silent and not nDone % 500:
                    if not doSubstructFPs:
                        logger.info(
                            '  searched %d (of %d) molecules; %d hits so far' %
                            (nDone, nToDo, len(ids)))
                    else:
                        logger.info(
                            '  searched through %d molecules; %d hits so far' %
                            (nDone, len(ids)))
                row = curs.fetchone()
            if not options.silent and doSubstructFPs and nToDo:
                # molecules never fetched were removed by the fingerprint screen
                nFiltered = nToDo - nDone
                logger.info(
                    '   Fingerprint screenout rate: %d of %d (%%%.2f)' %
                    (nFiltered, nToDo, 100. * nFiltered / nToDo))

        elif options.propQuery:
            if not options.silent: logger.info('Doing property query')
            # only the text before the first ';' is used
            propQuery = options.propQuery.split(';')[0]
            curs.execute(
                'select %(idCol)s from molecules where %(propQuery)s' %
                locals())
            ids = [x[0] for x in curs.fetchall()]
        if not options.silent:
            logger.info('Found %d molecules matching the query' % (len(ids)))

    # ------------------------------------------------------------------
    # similarity search and text output
    # ------------------------------------------------------------------
    t1 = time.time()
    if probes:
        if not options.silent: logger.info('Finding Neighbors')
        conn = DbConnect(dbName)
        cns = conn.GetColumnNames(fpTableName)
        curs = conn.GetCursor()

        if ids:
            # restrict the fingerprint pool to the ids found above
            ids = [(x, ) for x in ids]
            curs.execute(
                'create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                locals())
            curs.executemany('insert into _tmpTbl values (?)', ids)
            join = 'join  _tmpTbl using (%(idCol)s)' % locals()
        else:
            join = ''

        if cns[0].lower() != idCol.lower():
            # backwards compatibility to the days when mol tables had a guid and
            # the fps tables did not:
            curs.execute("attach database '%(molDbName)s' as mols" % locals())
            curs.execute("""
  select %(idCol)s,%(fpColName)s from %(fpTableName)s join
      (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)
    using (%(molIdName)s)
""" % (locals()))
        else:
            curs.execute(
                'select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'
                % locals())

        def poolFromCurs(curs, similarityMethod):
            """Lazily yield (id, fingerprint) pairs from the cursor."""
            row = curs.fetchone()
            while row:
                id, pkl = row
                fp = DepickleFP(pkl, similarityMethod)
                yield (id, fp)
                row = curs.fetchone()

        topNLists = GetNeighborLists(probes,
                                     options.topN,
                                     poolFromCurs(curs,
                                                  options.similarityType),
                                     simMetric=simMetric,
                                     simThresh=options.simThresh,
                                     **extraArgs)
        uniqIds = set()
        nbrLists = {}
        for i, nm in enumerate(nms):
            topNLists[i].reverse()
            scores = topNLists[i].GetPts()
            nbrNames = topNLists[i].GetExtras()
            nbrs = []
            for j, nbrGuid in enumerate(nbrNames):
                if nbrGuid is None:
                    # a None entry marks the end of the filled slots
                    break
                else:
                    uniqIds.add(nbrGuid)
                    nbrs.append((nbrGuid, scores[j]))
            nbrLists[(i, nm)] = nbrs
        t2 = time.time()
        if not options.silent:
            logger.info('The search took %.1f seconds' % (t2 - t1))

        if not options.silent: logger.info('Creating output')

        # map the neighbor ids back to molecule names
        curs = mConn.GetCursor()
        ids = list(uniqIds)

        ids = [(x, ) for x in ids]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)

        ks = list(nbrLists.keys())
        ks.sort()
        if not options.transpose:
            # one row per probe: name followed by (neighbor, score) pairs
            for i, nm in ks:
                nbrs = nbrLists[(i, nm)]
                nbrTxt = options.outputDelim.join([nm] + [
                    '%s%s%.3f' % (nmDict[id], options.outputDelim, score)
                    for id, score in nbrs
                ])
                if outF: print(nbrTxt, file=outF)
        else:
            # one pair of columns per probe, one row per neighbor rank
            labels = [
                '%s%sSimilarity' % (x[1], options.outputDelim) for x in ks
            ]
            if outF: print(options.outputDelim.join(labels), file=outF)
            for i in range(options.topN):
                outL = []
                for idx, nm in ks:
                    # NOTE(review): this raises IndexError when a probe has
                    # fewer than topN neighbors -- confirm intended handling.
                    nbr = nbrLists[(idx, nm)][i]
                    outL.append(nmDict[nbr[0]])
                    outL.append('%.3f' % nbr[1])
                if outF: print(options.outputDelim.join(outL), file=outF)
    else:
        # no probes: just report the molecules matching the query
        if not options.silent: logger.info('Creating output')
        curs = mConn.GetCursor()
        # ``ids`` is still None when no query was run at all; treat that as
        # "no hits" instead of failing in set()
        ids = [(x, ) for x in set(ids or ())]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        molIdName = options.molIdName
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {}
        for guid, id in curs.fetchall():
            nmDict[guid] = str(id)
        if outF: print('\n'.join(nmDict.values()), file=outF)

    # ------------------------------------------------------------------
    # optional molecule output (SD file and/or SMILES)
    # ------------------------------------------------------------------
    if molsOut and ids:
        molDbName = os.path.join(options.dbDir, options.molDbName)
        cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
        # make sure the pickled molecule is the last column selected
        if cns[-1] != 'molpkl':
            cns.remove('molpkl')
            cns.append('molpkl')

        curs = mConn.GetCursor()
        #curs.execute('create temporary table _tmpTbl (guid integer)'%locals())
        #curs.executemany('insert into _tmpTbl values (?)',ids)
        # _tmpTbl was filled on this connection by the output code above
        cnText = ','.join(cns)
        curs.execute(
            'select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)' %
            locals())

        row = curs.fetchone()
        molD = {}
        while row:
            row = list(row)
            m = _molFromPkl(row[-1])
            guid = row[0]
            nm = nmDict[guid]
            if sdfOut:
                m.SetProp('_Name', nm)
                print(Chem.MolToMolBlock(m), file=sdfOut)
                # every column between the id and the pickle becomes an SD
                # data field
                for i in range(1, len(cns) - 1):
                    pn = cns[i]
                    pv = str(row[i])
                    print('> <%s>\n%s\n' % (pn, pv), file=sdfOut)
                print('$$$$', file=sdfOut)
            if smilesOut:
                smi = Chem.MolToSmiles(m, options.chiralSmiles)
                print('%s %s' % (smi, str(row[1])), file=smilesOut)
            row = curs.fetchone()

    # close the output files opened above (never sys.stdout) so that
    # everything written is flushed to disk
    for fh in (outF, sdfOut, smilesOut):
        if fh is not None and fh is not sys.stdout:
            fh.close()
    if not options.silent: logger.info('Done!')
Example #34
0
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger

# Rebinds the imported ``logger`` factory name to the logger instance it returns.
logger = logger()

# ``tests`` is a table of on/off flags indexed by test number.  With no
# command-line arguments every test is enabled; otherwise only test 1 and the
# tests whose numbers are given on the command line are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
ts = []  # elapsed wall-clock time of each timed section, in seconds
mols = []
# Open in text mode: in binary mode every line is a ``bytes`` object and
# ``line.strip().split(" ")`` below raises TypeError on Python 3.  The context
# manager also makes sure the gzip handle is closed again.
with gzip.open("../Data/znp.50k.smi.gz", "rt") as inF:
    lines = inF.readlines()
logger.info("mols from smiles")
nMols = 0
nBad = 0
t1 = time.time()
for line in lines:
    # the SMILES is the first space-separated field of each line
    line = line.strip().split(" ")
    m = Chem.MolFromSmiles(line[0])
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1

t2 = time.time()
logger.info("Results1: %.2f seconds, %d passed, %d failed" % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)
Example #35
0
# Command-line interface.  ``MyOption`` is the option class that provides the
# 'floatlist' option type used by --similarityThreshold.
parser = OptionParser("distance predict", version='%prog', option_class=MyOption)
parser.add_option('--maxPathLength', '--max', default=8, type=int,
                  help='maximum length path for the fingerprint')
parser.add_option('--similarityThreshold', '--sim', default=[0.9], type='floatlist',
                  help='threshold for similarity')
parser.add_option('--numNeighbors', '--num', '-n', '-k', default=50, type=int,
                  help='number of neighbors to consider')
parser.add_option('--neighborsFile', '--nbrs', default='',
                  help='name of an output file to hold the neighbor lists')
parser.add_option('--scan', default=False, action="store_true")

if __name__ == '__main__':
  options, args = parser.parse_args()
  # The last positional argument names the output file.  ``file()`` was a
  # Python 2 builtin that no longer exists in Python 3; ``open()`` is the
  # equivalent call.
  outF = open(args[-1], 'w+')

  logger.info('reading training molecules and generating fingerprints')
  # the first positional argument is the SD file with the training molecules
  suppl = Chem.SDMolSupplier(args[0])
  train = []
  for i, mol in enumerate(suppl):
    if not mol:
      # skip records that could not be turned into molecules
      continue
    smi = Chem.MolToSmiles(mol, True)
    nm = mol.GetProp(nameField)
    property = float(mol.GetProp(propField))
    fp = GetMolFingerprint(mol, options.maxPathLength)
    train.append((nm, smi, fp, property))
  logger.info('  got %d molecules' % len(train))

  # with more than two positional arguments the second one is a separate
  # SD file of test molecules
  if len(args) > 2:
    suppl = Chem.SDMolSupplier(args[1])
    haveTest = True
Example #36
0

# Rebinds the imported ``logger`` factory name to the logger instance it returns.
logger = logger()

# ``tests`` is a table of on/off flags indexed by test number.  With no
# command-line arguments every test is enabled; otherwise only the tests whose
# numbers are given on the command line are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
ts = []
mols = []

if tests[0]:
    # Read the whole file up front (outside the timed section) and make sure
    # the gzip handle is closed again instead of being leaked.
    with gzip.open(data('znp.50k.smi.gz'), 'rt') as inF:
        lines = inF.readlines()
    logger.info('mols from smiles')
    nMols = 0
    nBad = 0
    t1 = time.time()
    for line in lines:
        # the SMILES is the first space-separated field of each line
        line = line.strip().split(' ')
        m = Chem.MolFromSmiles(line[0])
        if m:
            nMols += 1
            mols.append(m)
        else:
            nBad += 1

    t2 = time.time()
    logger.info('Results1: %.2f seconds, %d passed, %d failed' %
                (t2 - t1, nMols, nBad))
Example #37
0
from rdkit.Chem import Recap
from rdkit.RDLogger import logger

# Rebinds the imported ``logger`` factory name to the logger instance it returns.
logger = logger()

# ``tests`` is a table of on/off flags indexed by test number.  With no
# command-line arguments every test is enabled; otherwise only test 1 and the
# tests whose numbers are given on the command line are enabled.
tests = [1] * 1001
if len(sys.argv) > 1:
    tests = [0] * 1001
    tests[1] = 1
    for x in sys.argv[1:]:
        x = int(x)
        tests[x] = 1
ts = []  # elapsed wall-clock time of each timed section, in seconds

# Read the whole SD file up front (outside the timed section) and make sure the
# gzip handle is closed again instead of being leaked.
with gzip.open("../Data/mols.1000.sdf.gz") as inF:
    sdData = inF.read()
logger.info("mols from sdf")
suppl = Chem.SDMolSupplier()
suppl.SetData(sdData)
mols = []
nMols = 0
nBad = 0
t1 = time.time()
for m in suppl:
    # records that cannot be parsed evaluate as false and are only counted
    if m:
        nMols += 1
        mols.append(m)
    else:
        nBad += 1
t2 = time.time()
logger.info("Results1: %.2f seconds, %d passed, %d failed" % (t2 - t1, nMols, nBad))
ts.append(t2 - t1)