Beispiel #1
0
  def test1SDSupplier(self):
    """Iterate, re-iterate, reset and exhaust an SDMolSupplier on NCI_aids.10.sdf."""
    sdPath = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'NCI_aids.10.sdf')
    suppl = Chem.SDMolSupplier(sdPath)

    # two consecutive full passes must both see every record
    for _ in range(2):
      mols = [mol for mol in suppl]
      self.assertEqual(len(mols), 10)

    # after reset() the first two records are checked property by property
    suppl.reset()
    for name, casNumber in (('48', '15716-70-8'), ('78', '6290-84-2')):
      mol = next(suppl)
      self.assertEqual(mol.GetProp('_Name'), name)
      self.assertEqual(mol.GetProp('NSC'), name)
      self.assertEqual(mol.GetProp('CAS_RN'), casNumber)

    # read all ten records; one further read must signal exhaustion
    suppl.reset()
    for _ in range(10):
      mol = next(suppl)

    with self.assertRaises(StopIteration):
      mol = next(suppl)
    def test2SmilesSupplier(self):
        """Iterate, re-iterate, reset and exhaust a SmilesMolSupplier on pgp_20.txt.

        Uses unittest assertions instead of bare ``assert`` statements so the
        checks still run under ``python -O`` and report both values on failure.
        """
        fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib',
                             'test_data', 'pgp_20.txt')

        suppl = Chem.SmilesMolSupplier(fileN,
                                       delimiter='\t',
                                       smilesColumn=2,
                                       nameColumn=1,
                                       titleLine=1)
        ms = [x for x in suppl]
        self.assertEqual(len(ms), 20)

        # test repeating:
        ms = [x for x in suppl]
        self.assertEqual(len(ms), 20)

        # test reset:
        suppl.reset()
        m = next(suppl)
        self.assertEqual(m.GetProp('_Name'), 'ALDOSTERONE')
        self.assertEqual(m.GetProp('ID'), 'RD-PGP-0001')
        m = next(suppl)
        self.assertEqual(m.GetProp('_Name'), 'AMIODARONE')
        self.assertEqual(m.GetProp('ID'), 'RD-PGP-0002')

        # consume every record; one further read must raise StopIteration
        suppl.reset()
        for _ in range(20):
            next(suppl)
        with self.assertRaises(StopIteration):
            next(suppl)
Beispiel #3
0
def SupplierFromDetails(details):
  """Build a molecule supply node from a details object.

  A database-backed node is created when ``details.dbName`` is set; otherwise
  a SMILES-file node is built from ``details.inFileName`` and the related
  ``delim`` / ``nameCol`` / ``smiCol`` / ``hasTitle`` attributes.

  If ``details.actCol`` or ``details.nameCol`` is an integer, it is replaced
  in place by the property name found at that position on the first molecule
  of the supplier.

  Returns the supplier.  Whenever a molecule had to be read to resolve a
  column, the supplier is reset before being returned, so the caller always
  starts from the first record.
  """
  from rdkit.VLib.NodeLib.DbMolSupply import DbMolSupplyNode
  from rdkit.VLib.NodeLib.SmilesSupply import SmilesSupplyNode

  if details.dbName:
    conn = DbConnect(details.dbName, details.tableName)
    suppl = DbMolSupplyNode(conn.GetData())
  else:
    suppl = SmilesSupplyNode(details.inFileName, delim=details.delim, nameColumn=details.nameCol,
                             smilesColumn=details.smiCol, titleLine=details.hasTitle)

  # Resolve integer column indices to property names.  This is the same for
  # both supplier kinds, so it is done once here instead of being duplicated.
  consumed = False
  for attrName in ('actCol', 'nameCol'):
    column = getattr(details, attrName)
    if isinstance(column, int):
      suppl.reset()
      mol = next(suppl)
      setattr(details, attrName, mol.GetPropNames()[column])
      consumed = True
  if consumed:
    # the original only rewound after resolving nameCol, leaving the supplier
    # one molecule in when only actCol was an index
    suppl.reset()
  return suppl
Beispiel #4
0
  def test2SmilesSupplier(self):
    """Iterate, re-iterate, reset and exhaust a SmilesMolSupplier on pgp_20.txt.

    Uses unittest assertions instead of bare ``assert`` statements so the
    checks still run under ``python -O`` and report both values on failure.
    """
    fileN = os.path.join(RDConfig.RDCodeDir,'VLib','NodeLib','test_data','pgp_20.txt')

    suppl = Chem.SmilesMolSupplier(fileN,delimiter='\t',smilesColumn=2,
                                   nameColumn=1,titleLine=1)
    ms = [x for x in suppl]
    self.assertEqual(len(ms), 20)

    # test repeating:
    ms = [x for x in suppl]
    self.assertEqual(len(ms), 20)

    # test reset:
    suppl.reset()
    m = next(suppl)
    self.assertEqual(m.GetProp('_Name'), 'ALDOSTERONE')
    self.assertEqual(m.GetProp('ID'), 'RD-PGP-0001')
    m = next(suppl)
    self.assertEqual(m.GetProp('_Name'), 'AMIODARONE')
    self.assertEqual(m.GetProp('ID'), 'RD-PGP-0002')

    # consume every record; one further read must raise StopIteration
    suppl.reset()
    for _ in range(20):
      next(suppl)
    with self.assertRaises(StopIteration):
      next(suppl)
Beispiel #5
0
  def test1SDSupplier(self):
    """Iterate, re-iterate, reset and exhaust an SDMolSupplier on NCI_aids.10.sdf.

    Uses unittest assertions instead of bare ``assert`` statements so the
    checks still run under ``python -O`` and report both values on failure.
    """
    fileN = os.path.join(RDConfig.RDCodeDir,'VLib','NodeLib','test_data','NCI_aids.10.sdf')

    suppl = Chem.SDMolSupplier(fileN)
    ms = [x for x in suppl]
    self.assertEqual(len(ms), 10)

    # test repeating:
    ms = [x for x in suppl]
    self.assertEqual(len(ms), 10)

    # test reset:
    suppl.reset()
    m = next(suppl)
    self.assertEqual(m.GetProp('_Name'), '48')
    self.assertEqual(m.GetProp('NSC'), '48')
    self.assertEqual(m.GetProp('CAS_RN'), '15716-70-8')
    m = next(suppl)
    self.assertEqual(m.GetProp('_Name'), '78')
    self.assertEqual(m.GetProp('NSC'), '78')
    self.assertEqual(m.GetProp('CAS_RN'), '6290-84-2')

    # consume every record; one further read must raise StopIteration
    suppl.reset()
    for _ in range(10):
      next(suppl)
    with self.assertRaises(StopIteration):
      next(suppl)
    def test1SDSupplier(self):
        """Iterate, re-iterate, reset and exhaust an SDMolSupplier on NCI_aids.10.sdf.

        Uses unittest assertions instead of bare ``assert`` statements so the
        checks still run under ``python -O`` and report both values on failure.
        """
        fileN = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib',
                             'test_data', 'NCI_aids.10.sdf')

        suppl = Chem.SDMolSupplier(fileN)
        ms = [x for x in suppl]
        self.assertEqual(len(ms), 10)

        # test repeating:
        ms = [x for x in suppl]
        self.assertEqual(len(ms), 10)

        # test reset:
        suppl.reset()
        m = next(suppl)
        self.assertEqual(m.GetProp('_Name'), '48')
        self.assertEqual(m.GetProp('NSC'), '48')
        self.assertEqual(m.GetProp('CAS_RN'), '15716-70-8')
        m = next(suppl)
        self.assertEqual(m.GetProp('_Name'), '78')
        self.assertEqual(m.GetProp('NSC'), '78')
        self.assertEqual(m.GetProp('CAS_RN'), '6290-84-2')

        # consume every record; one further read must raise StopIteration
        suppl.reset()
        for _ in range(10):
            next(suppl)
        with self.assertRaises(StopIteration):
            next(suppl)
Beispiel #7
0
 def testOrderBug(self):
   """The first two records of orderBug.sdf must yield equal Gen2DFingerprint signatures."""
   path = os.path.join(RDConfig.RDCodeDir, 'Chem', 'Pharm2D', 'test_data', 'orderBug.sdf')
   reader = Chem.SDMolSupplier(path)
   # read both molecules before fingerprinting either of them
   first, second = next(reader), next(reader)
   firstSig = Generate.Gen2DFingerprint(first, self.factory)
   secondSig = Generate.Gen2DFingerprint(second, self.factory)
   self.assertEqual(firstSig, secondSig)
Beispiel #8
0
 def testOrderBug(self):
     """The first two records of orderBug.sdf must yield equal Gen2DFingerprint signatures."""
     path = os.path.join(RDConfig.RDCodeDir, 'Chem', 'Pharm2D', 'test_data', 'orderBug.sdf')
     reader = Chem.SDMolSupplier(path)
     # read both molecules before fingerprinting either of them
     first, second = next(reader), next(reader)
     firstSig = Generate.Gen2DFingerprint(first, self.factory)
     secondSig = Generate.Gen2DFingerprint(second, self.factory)
     self.assertEqual(firstSig, secondSig)
    def test_SmilesReaderIndex(self):
        """Mix sequential, indexed and negative-index reads on a text-backed SMILES supplier."""
        smilesBlock = '\n'.join(self.smis)
        supp = Chem.SmilesMolSupplierFromText(smilesBlock, ',', 0, -1, 0)

        # sequential reads of the first four entries
        for pos in range(4):
            mol = next(supp)
            self.assertMolecule(mol, pos)

        # jump straight to the last entry
        lastPos = len(supp) - 1
        self.assertMolecule(supp[lastPos], lastPos)

        # a full pass over the supplier yields every input SMILES
        smiles = []
        for mol in supp:
            smiles.append(Chem.MolToSmiles(mol))
        self.assertEqual(smiles, self.smis)

        self.assertEqual(len(supp), self.nMolecules, 'bad supplier length')

        # indexed access still works after a complete iteration
        backPos = self.nMolecules - 3
        self.assertMolecule(supp[backPos - 1], backPos, msg='back index: ')

        # reading one past the end must fail
        with self.assertRaises(IndexError):
            _ = supp[self.nMolecules]

        # index -1 addresses the same molecule as len - 1
        byLength = supp[len(supp) - 1]
        byNegative = supp[-1]
        self.assertEqual(Chem.MolToSmiles(byLength), Chem.MolToSmiles(byNegative))
  def test_SmilesReaderIndex(self):
    """Mix sequential, indexed and negative-index reads on a text-backed SMILES supplier."""
    smilesBlock = '\n'.join(self.smis)
    supp = Chem.SmilesMolSupplierFromText(smilesBlock, ',', 0, -1, 0)

    # sequential reads of the first four entries
    for pos in range(4):
      mol = next(supp)
      self.assertMolecule(mol, pos)

    # jump straight to the last entry
    lastPos = len(supp) - 1
    self.assertMolecule(supp[lastPos], lastPos)

    # a full pass over the supplier yields every input SMILES
    smiles = []
    for mol in supp:
      smiles.append(Chem.MolToSmiles(mol))
    self.assertEqual(smiles, self.smis)

    self.assertEqual(len(supp), self.nMolecules, 'bad supplier length')

    # indexed access still works after a complete iteration
    backPos = self.nMolecules - 3
    self.assertMolecule(supp[backPos - 1], backPos, msg='back index: ')

    # reading one past the end must fail
    with self.assertRaises(IndexError):
      _ = supp[self.nMolecules]

    # index -1 addresses the same molecule as len - 1
    byLength = supp[len(supp) - 1]
    byNegative = supp[-1]
    self.assertEqual(Chem.MolToSmiles(byLength), Chem.MolToSmiles(byNegative))
  def test1LazyReader(self):
    """ tests lazy reads

    Reads the first four molecules sequentially, then mixes indexed reads
    with a full iteration, and finally checks that reading one past the end
    raises IndexError.
    """
    supp = Chem.SmilesMolSupplierFromText('\n'.join(self.smis),',',0,-1,0)
    for i in range(4):
      m = next(supp)
      self.assertTrue(m, 'read %d failed' % i)
      self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)
    i = len(supp)-1
    m = supp[i]
    self.assertTrue(m, 'read %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # exercise a full pass plus SMILES generation for every readable molecule
    # (the result is not compared against anything here)
    ms = [Chem.MolToSmiles(x) if x else x for x in supp]

    self.assertEqual(len(supp), len(self.smis), 'bad supplier length')

    i = len(self.smis)-3
    m = supp[i-1]
    self.assertTrue(m, 'back index %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # a bare "except:" would also have swallowed KeyboardInterrupt/SystemExit
    with self.assertRaises(IndexError):
      m = supp[len(self.smis)]  # out of bound read must fail
Beispiel #12
0
 def _testStreamRoundtrip(self):
   """Write the input SD file out again and re-read the output through a stream.

   Checks that the record separator count survives the file round trip and
   that dumping a stream-backed supplier reproduces the written text.
   """
   with open(self.fName) as inf:
     inD = inf.read()
   supp = Chem.SDMolSupplier(self.fName)
   # mkstemp creates the file atomically (mktemp is race-prone and deprecated);
   # only the path is needed, so the descriptor is closed right away
   fd, outName = tempfile.mkstemp('.sdf')
   os.close(fd)
   writer = Chem.SDWriter(outName)
   # kept from the original: one explicit read before the loop; the count
   # assertion below implies the for loop still visits every record
   next(supp)
   for m in supp:
     writer.write(m)
   writer.flush()
   # drop the last reference so the writer releases the file
   writer = None
   with open(outName,'r') as outf:
     outD = outf.read()
   # best-effort cleanup: retry once after a short wait, then give up quietly
   try:
     os.unlink(outName)
   except OSError:
     import time
     time.sleep(1)
     try:
       os.unlink(outName)
     except OSError:
       pass
   self.assertEqual(inD.count('$$$$'), outD.count('$$$$'), 'bad nMols in output')
   stream = StringIO(outD)
   supp = Chem.SDMolSupplier(stream=stream)
   outD2 = supp.Dump()
   self.assertEqual(outD2.count('$$$$'), len(supp), 'bad nMols in output')
   self.assertEqual(outD2.count('$$$$'), outD.count('$$$$'), 'bad nMols in output')
   self.assertEqual(outD2, outD, 'bad outd')
Beispiel #13
0
 def _testStreamRoundtrip(self):
   """Write the input SD file out again and re-read the output through a stream.

   Checks that the record separator count survives the file round trip and
   that dumping a stream-backed supplier reproduces the written text.
   """
   with open(self.fName) as inf:
     inD = inf.read()
   supp = Chem.SDMolSupplier(self.fName)
   # mkstemp creates the file atomically (mktemp is race-prone and deprecated);
   # only the path is needed, so the descriptor is closed right away
   fd, outName = tempfile.mkstemp('.sdf')
   os.close(fd)
   writer = Chem.SDWriter(outName)
   # kept from the original: one explicit read before the loop; the count
   # assertion below implies the for loop still visits every record
   next(supp)
   for m in supp:
     writer.write(m)
   writer.flush()
   # drop the last reference so the writer releases the file
   writer = None
   with open(outName,'r') as outf:
     outD = outf.read()
   # best-effort cleanup: retry once after a short wait, then give up quietly.
   # Only OSError is caught; a bare "except:" would hide KeyboardInterrupt too.
   try:
     os.unlink(outName)
   except OSError:
     import time
     time.sleep(1)
     try:
       os.unlink(outName)
     except OSError:
       pass
   self.assertEqual(inD.count('$$$$'), outD.count('$$$$'), 'bad nMols in output')
   stream = StringIO(outD)
   supp = Chem.SDMolSupplier(stream=stream)
   outD2 = supp.Dump()
   self.assertEqual(outD2.count('$$$$'), len(supp), 'bad nMols in output')
   self.assertEqual(outD2.count('$$$$'), outD.count('$$$$'), 'bad nMols in output')
   self.assertEqual(outD2, outD, 'bad outd')
Beispiel #14
0
  def _testReader(self):
    """ tests reads using a file name

    Mixes sequential reads, indexed reads and a length query, then checks
    that an index beyond the end raises IndexError.  unittest assertions
    replace bare asserts so the checks also run under ``python -O``.
    """
    supp = Chem.SDMolSupplier(self.fName)
    for i in range(10):
      m = next(supp)
      self.assertTrue(m, 'read %d failed' % i)
      self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # direct access well past the sequential read position
    i = 100
    m = supp[i-1]
    self.assertTrue(m, 'read %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    self.assertEqual(len(supp), 200, 'bad supplier length')

    # stepping back to an earlier index must still work
    i = 12
    m = supp[i-1]
    self.assertTrue(m, 'back index %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    with self.assertRaises(IndexError):
      m = supp[201]  # out of bound read must fail
Beispiel #15
0
  def _testReader(self):
    """ tests reads using a file name

    Mixes sequential reads, indexed reads and a length query, then checks
    that an index beyond the end raises IndexError.  unittest assertions
    replace bare asserts so the checks also run under ``python -O``.
    """
    supp = Chem.SDMolSupplier(self.fName)
    for i in range(10):
      m = next(supp)
      self.assertTrue(m, 'read %d failed' % i)
      self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # direct access well past the sequential read position
    i = 100
    m = supp[i-1]
    self.assertTrue(m, 'read %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    self.assertEqual(len(supp), 200, 'bad supplier length')

    # stepping back to an earlier index must still work
    i = 12
    m = supp[i-1]
    self.assertTrue(m, 'back index %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    with self.assertRaises(IndexError):
      m = supp[201]  # out of bound read must fail
Beispiel #16
0
 def test_Writer(self):
   """ tests writes using a file name

   Copies every molecule of the input SD file to a temporary file and checks
   that the number of '$$$$' record separators is unchanged.
   """
   with open(self.fName,'r') as inf:
     inD = inf.read()
   supp = Chem.SDMolSupplier(self.fName)
   # mkstemp creates the file atomically (mktemp is race-prone and deprecated);
   # only the path is needed, so the descriptor is closed right away
   fd, outName = tempfile.mkstemp('.sdf')
   os.close(fd)
   writer = Chem.SDWriter(outName)
   # the first molecule's property names define the properties to write
   m1 = next(supp)
   writer.SetProps(m1.GetPropNames())
   for m in supp:
     writer.write(m)
   writer.flush()
   # drop the last reference so the writer releases the file
   writer = None
   with open(outName,'r') as inf:
     outD = inf.read()
   # best-effort cleanup: retry once after a short wait, then give up quietly.
   # Only OSError is caught; a bare "except:" would hide KeyboardInterrupt too.
   try:
     os.unlink(outName)
   except OSError:
     import time
     time.sleep(1)
     try:
       os.unlink(outName)
     except OSError:
       pass
   self.assertEqual(inD.count('$$$$'),outD.count('$$$$'),'bad nMols in output')
Beispiel #17
0
    def test_SDMolSupplier(self):
        """Mix iterator, indexed and negative-index access on a file-backed SDMolSupplier."""
        supp = Chem.SDMolSupplier(self.fName)

        # iterator protocol: the first ten molecules, in order
        for pos in range(10):
            self.assertMolecule(next(supp), pos)

        # random access ahead of the iterator position
        target = 100
        self.assertMolecule(supp[target - 1], target)

        # the supplier reports how many molecules it holds
        self.assertEqual(len(supp), self.nMolecules, 'bad supplier length')

        # random access still works once the length is known
        target = 12
        self.assertMolecule(supp[target - 1], target)

        # an index equal to the length is out of range
        with self.assertRaises(IndexError):
            _ = supp[self.nMolecules]

        # index -1 addresses the same molecule as len - 1
        byLength = supp[len(supp) - 1]
        byNegative = supp[-1]
        self.assertEqual(Chem.MolToSmiles(byLength), Chem.MolToSmiles(byNegative))
  def test1LazyReader(self):
    """ tests lazy reads

    Reads the first four molecules sequentially, then mixes indexed reads
    with a full iteration, and finally checks that reading one past the end
    raises IndexError.
    """
    supp = Chem.SmilesMolSupplierFromText('\n'.join(self.smis),',',0,-1,0)
    for i in range(4):
      m = next(supp)
      self.assertTrue(m, 'read %d failed' % i)
      self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)
    i = len(supp)-1
    m = supp[i]
    self.assertTrue(m, 'read %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # exercise a full pass plus SMILES generation for every readable molecule
    # (the result is not compared against anything here)
    ms = [Chem.MolToSmiles(x) if x else x for x in supp]

    self.assertEqual(len(supp), len(self.smis), 'bad supplier length')

    i = len(self.smis)-3
    m = supp[i-1]
    self.assertTrue(m, 'back index %d failed' % i)
    self.assertTrue(m.GetNumAtoms(), 'no atoms in mol %d' % i)

    # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12
    with self.assertRaises(IndexError):
      m = supp[len(self.smis)] # out of bound read must fail
Beispiel #19
0
 def test_Writer(self):
   """ tests writes using a file name

   Copies every molecule of the input SD file to a temporary file and checks
   that the number of '$$$$' record separators is unchanged.
   """
   with open(self.fName,'r') as inf:
     inD = inf.read()
   supp = Chem.SDMolSupplier(self.fName)
   # mkstemp creates the file atomically (mktemp is race-prone and deprecated);
   # only the path is needed, so the descriptor is closed right away
   fd, outName = tempfile.mkstemp('.sdf')
   os.close(fd)
   writer = Chem.SDWriter(outName)
   # the first molecule's property names define the properties to write
   m1 = next(supp)
   writer.SetProps(m1.GetPropNames())
   for m in supp:
     writer.write(m)
   writer.flush()
   # close() is not called here; dropping the last reference lets the
   # garbage collector release the file.
   writer = None
   with open(outName,'r') as inf:
     outD = inf.read()
   # The file should be closed, but if it isn't, and this
   # is Windows, then the unlink() can fail. Wait and try again.
   try:
     os.unlink(outName)
   except OSError:
     import time
     time.sleep(1)
     try:
       os.unlink(outName)
     except OSError:
       pass
   self.assertEqual(inD.count('$$$$'),outD.count('$$$$'),'bad nMols in output')
Beispiel #20
0
    def test_SDWriter(self):
        """Copy every molecule to a temporary SD file and count the written records.

        The output must contain ``self.nMolecules`` '$$$$' record separators.
        """
        supp = Chem.SDMolSupplier(self.fName)
        # mkstemp returns an open descriptor as well as the path; close it so
        # the descriptor does not leak (the writer opens the path itself)
        fd, outName = tempfile.mkstemp('.sdf')
        os.close(fd)
        writer = Chem.SDWriter(outName)
        # the first molecule's property names define the properties to write
        m1 = next(supp)
        writer.SetProps(m1.GetPropNames())
        for m in supp:
            writer.write(m)
        writer.flush()
        writer.close()

        # Also drop the reference so nothing keeps the file open.
        writer = None
        with open(outName, 'r') as inf:
            outD = inf.read()
        # The file should be closed, but if it isn't, and this
        # is Windows, then the unlink() can fail. Wait and try again.
        try:
            os.unlink(outName)
        except OSError:
            import time
            time.sleep(1)
            try:
                os.unlink(outName)
            except OSError:
                pass
        self.assertEqual(self.nMolecules, outD.count('$$$$'),
                         'bad nMols in output')
Beispiel #21
0
def ScoreFromLists(bitLists, suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2,
                   reportFreq=10):
  """  similar to _ScoreMolecules()_, but uses pre-calculated bit lists
    for the molecules (this speeds things up a lot)


    **Arguments**

      - bitLists: sequence of on bit sequences for the input molecules
        (bit ids are 1-based; id N is counted against row N-1 of the result)

      - suppl: the input supplier (we read activities from here)

      - catalog: the FragmentCatalog

      - maxPts: (optional) the maximum number of molecules to be
        considered

      - actName: (optional) the name of the molecule's activity property.
        If this is not provided, the molecule's last property will be used.

      - acts: (optional) sequence of pre-computed integer activities, one per
        molecule.  When provided, activities are taken from here instead of
        from the molecules' properties.

      - nActs: (optional) number of possible activity values

      - reportFreq: (optional) how often to display status information

    **Returns**

       the results table (a 3D array of ints nBits x 2 x nActs)

  """
  nBits = catalog.GetFPLength()
  if maxPts > 0:
    nPts = maxPts
  else:
    nPts = len(bitLists)
  # numpy.int was only an alias for the builtin int and was removed in NumPy 1.24
  resTbl = numpy.zeros((nBits, 2, nActs), int)
  if not actName and not acts:
    actName = suppl[0].GetPropNames()[-1]
  suppl.reset()
  for i in range(1, nPts + 1):
    mol = next(suppl)
    if not acts:
      act = int(mol.GetProp(actName))
    else:
      act = acts[i - 1]
    if not i % reportFreq:
      message('Done %d of %d\n' % (i, nPts))
    # de-duplicate so that each bit is moved from "off" to "on" at most once
    ids = {id_ - 1 for id_ in bitLists[i - 1]}
    # start by counting every bit as "off" for this activity ...
    resTbl[:, 0, act] += 1
    # ... then move the bits that are actually set over to the "on" column
    for id_ in ids:
      resTbl[id_, 0, act] -= 1
      resTbl[id_, 1, act] += 1
  return resTbl
Beispiel #22
0
  def test2SmilesSupplier(self):
    """Iterate, re-iterate, reset and exhaust a SmilesMolSupplier on pgp_20.txt."""
    smiPath = os.path.join(RDConfig.RDCodeDir, 'VLib', 'NodeLib', 'test_data', 'pgp_20.txt')
    suppl = Chem.SmilesMolSupplier(smiPath, delimiter='\t', smilesColumn=2, nameColumn=1,
                                   titleLine=1)

    # two consecutive full passes must both see every record
    for _ in range(2):
      mols = [mol for mol in suppl]
      self.assertEqual(len(mols), 20)

    # after reset() the first two records are checked by name and ID
    suppl.reset()
    for name, ident in (('ALDOSTERONE', 'RD-PGP-0001'), ('AMIODARONE', 'RD-PGP-0002')):
      mol = next(suppl)
      self.assertEqual(mol.GetProp('_Name'), name)
      self.assertEqual(mol.GetProp('ID'), ident)

    # read all twenty records; one further read must signal exhaustion
    suppl.reset()
    for _ in range(20):
      mol = next(suppl)
    with self.assertRaises(StopIteration):
      mol = next(suppl)
Beispiel #23
0
    def test7ConstrainedEmbedding(self):
        """Embed a copy of the reference with its first five atoms pinned, then align on them."""
        sdPath = os.path.join(RDConfig.RDBaseDir, "Code", "GraphMol", "DistGeomHelpers",
                              "test_data", "constrain1.sdf")
        reader = Chem.SDMolSupplier(sdPath)
        ref = next(reader)
        probe = copy.deepcopy(ref)

        # coordinate map: atom index -> position of that atom in the reference
        pinned = {idx: ref.GetConformer().GetAtomPosition(idx) for idx in range(5)}
        confId = rdDistGeom.EmbedMolecule(probe, coordMap=pinned, randomSeed=23)
        self.assertTrue(confId > -1)

        # align the probe onto the reference using the same five atoms
        pairs = [(idx, idx) for idx in range(5)]
        deviation = rdMolAlign.AlignMol(probe, ref, atomMap=pairs)
        self.assertTrue(deviation < 0.1)
Beispiel #24
0
    def test7ConstrainedEmbedding(self):
        """Embed a copy of the reference with its first five atoms pinned, then align on them."""
        pathParts = ('Code', 'GraphMol', 'DistGeomHelpers', 'test_data', 'constrain1.sdf')
        sdPath = os.path.join(RDConfig.RDBaseDir, *pathParts)
        reader = Chem.SDMolSupplier(sdPath)
        ref = next(reader)
        probe = copy.deepcopy(ref)

        # coordinate map: atom index -> position of that atom in the reference
        pinned = {idx: ref.GetConformer().GetAtomPosition(idx) for idx in range(5)}
        confId = rdDistGeom.EmbedMolecule(probe, coordMap=pinned, randomSeed=23)
        self.assertTrue(confId > -1)

        # align the probe onto the reference using the same five atoms
        pairs = [(idx, idx) for idx in range(5)]
        deviation = rdMolAlign.AlignMol(probe, ref, atomMap=pairs)
        self.assertTrue(deviation < 0.1)
Beispiel #25
0
 def _nextMatch(self):
   """ *Internal use only*

   Pulls objects from self.dataIter until one reaches self.threshold in
   similarity to self.probe, and returns (similarity, object).
   """
   while True:
     # no explicit end test: when the data source iterator is exhausted the
     #  exception from six.next() propagates, which is how the search stops
     #  if no match is found
     candidate = six.next(self.dataIter)
     candidateFp = self.fingerprinter(candidate)
     score = DataStructs.FingerprintSimilarity(candidateFp, self.probe, self.metric)
     if score >= self.threshold:
       return score, candidate
Beispiel #26
0
def RecapDecompose(mol,
                   allNodes=None,
                   minFragmentSize=0,
                   onlyUseReactions=None):
    """ returns the recap decomposition for a molecule

    **Arguments**

      - mol: the molecule to decompose

      - allNodes: (optional) dictionary mapping SMILES to already-built
        hierarchy nodes.  It is updated in place; if the molecule's SMILES
        is already present, the stored node is returned immediately.

      - minFragmentSize: (optional) when > 0, any product sequence containing
        a fragment with fewer than this many non-dummy atoms is rejected.
        Otherwise sequences containing a fragment that reduces to '', 'C',
        'CC' or 'CCC' once dummy atoms are stripped are rejected.

      - onlyUseReactions: (optional) collection of indices into the module's
        reaction list; when provided, only these reactions are applied.

    **Returns**

      the RecapHierarchyNode at the root of the decomposition

    """
    mSmi = Chem.MolToSmiles(mol, 1)

    if allNodes is None:
        allNodes = {}
    if mSmi in allNodes:
        return allNodes[mSmi]

    res = RecapHierarchyNode(mol)
    res.smiles = mSmi
    activePool = {mSmi: res}
    allNodes[mSmi] = res
    while activePool:
        # take the first key of the pool (same pick as the former iterkeys())
        nSmi = next(iter(activePool))
        node = activePool.pop(nSmi)
        if not node.mol:
            continue
        for rxnIdx, reaction in enumerate(reactions):
            if onlyUseReactions and rxnIdx not in onlyUseReactions:
                continue
            for prodSeq in reaction.RunReactants((node.mol, )):
                seqOk = True
                # we want to disqualify small fragments, so sort the product sequence by size
                # and then look for "forbidden" fragments
                tSeq = sorted((prod.GetNumAtoms(onlyExplicit=True), idx)
                              for idx, prod in enumerate(prodSeq))
                prodSeq = [(nats, prodSeq[idx]) for nats, idx in tSeq]
                for nats, prod in prodSeq:
                    try:
                        Chem.SanitizeMol(prod)
                    except Exception:
                        # NOTE(review): a product that fails sanitization is skipped
                        # without pSmi being set, yet the loop below reads prod.pSmi
                        # for every product of an accepted sequence -- confirm intent.
                        continue
                    pSmi = Chem.MolToSmiles(prod, 1)
                    if minFragmentSize > 0:
                        nDummies = pSmi.count('*')
                        if nats - nDummies < minFragmentSize:
                            seqOk = False
                            break
                    # don't forget after replacing dummy atoms to remove any empty
                    # branches:
                    elif pSmi.replace('[*]', '').replace('()', '') in ('', 'C', 'CC', 'CCC'):
                        seqOk = False
                        break
                    prod.pSmi = pSmi
                if not seqOk:
                    continue
                for nats, prod in prodSeq:
                    pSmi = prod.pSmi
                    if pSmi not in allNodes:
                        # new fragment: register it and queue it for decomposition
                        pNode = RecapHierarchyNode(prod)
                        pNode.smiles = pSmi
                        activePool[pSmi] = pNode
                        allNodes[pSmi] = pNode
                    else:
                        pNode = allNodes[pSmi]
                    # link parent and child; the parent is held through a weak proxy
                    pNode.parents[nSmi] = weakref.proxy(node)
                    node.children[pSmi] = pNode
    return res
Beispiel #27
0
def BRICSDecompose(mol,allNodes=None,minFragmentSize=1,onlyUseReactions=None,
                   silent=True,keepNonLeafNodes=False,singlePass=False,returnMols=False):
  """ returns the BRICS decomposition for a molecule

  **Arguments**

    - mol: the molecule to decompose

    - allNodes: (optional) set of SMILES that have already been seen.  It is
      updated in place; if the molecule's SMILES is already present an empty
      set is returned.

    - minFragmentSize: (optional) when > 0, product sequences containing a
      fragment with fewer than this many non-dummy atoms are rejected

    - onlyUseReactions: (optional) collection of (groupIdx, reactionIdx)
      pairs; when provided, only these reactions are applied

    - silent: (optional) when False, progress information is printed

    - keepNonLeafNodes: (optional) also report fragments that were
      decomposed further

    - singlePass: (optional) do not decompose the products any further

    - returnMols: (optional) return molecules instead of SMILES

  >>> from rdkit import Chem
  >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
  >>> res = list(BRICSDecompose(m))
  >>> sorted(res)
  ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]']

  >>> res = list(BRICSDecompose(m,returnMols=True))
  >>> res[0]
  <rdkit.Chem.rdchem.Mol object ...>
  >>> smis = [Chem.MolToSmiles(x,True) for x in res]
  >>> sorted(smis)
  ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]']

  nexavar, an example from the paper (corrected):
  >>> m = Chem.MolFromSmiles('CNC(=O)C1=NC=CC(OC2=CC=C(NC(=O)NC3=CC(=C(Cl)C=C3)C(F)(F)F)C=C2)=C1')
  >>> res = list(BRICSDecompose(m))
  >>> sorted(res)
  ['[1*]C([1*])=O', '[1*]C([6*])=O', '[14*]c1cc([16*])ccn1', '[16*]c1ccc(Cl)c([16*])c1', '[16*]c1ccc([16*])cc1', '[3*]O[3*]', '[5*]NC', '[5*]N[5*]', '[8*]C(F)(F)F']

  it's also possible to keep pieces that haven't been fully decomposed:
  >>> m = Chem.MolFromSmiles('CCCOCC')
  >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True))
  >>> sorted(res)
  ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[3*]O[3*]', '[4*]CC', '[4*]CCC']

  >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
  >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True))
  >>> sorted(res)
  ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[16*]c1cccc([16*])c1', '[3*]OCCC', '[3*]OC[8*]', '[3*]OCc1cccc(-c2ccccn2)c1', '[3*]OCc1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]', '[4*]Cc1cccc(-c2ccccn2)c1', '[4*]Cc1cccc([16*])c1', '[8*]COCCC']

  or to only do a single pass of decomposition:
  >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
  >>> res = list(BRICSDecompose(m,singlePass=True))
  >>> sorted(res)
  ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[3*]OCCC', '[3*]OCc1cccc(-c2ccccn2)c1', '[4*]CCC', '[4*]Cc1cccc(-c2ccccn2)c1', '[8*]COCCC']

  setting a minimum size for the fragments:
  >>> m = Chem.MolFromSmiles('CCCOCC')
  >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=2))
  >>> sorted(res)
  ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC']
  >>> m = Chem.MolFromSmiles('CCCOCC')
  >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=3))
  >>> sorted(res)
  ['CCCOCC', '[3*]OCC', '[4*]CCC']
  >>> res = list(BRICSDecompose(m,minFragmentSize=2))
  >>> sorted(res)
  ['[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC']


  """
  # `reactions` is only read here, so no `global` declaration is needed
  mSmi = Chem.MolToSmiles(mol,1)

  if allNodes is None:
    allNodes=set()

  if mSmi in allNodes:
    return set()

  activePool={mSmi:mol}
  allNodes.add(mSmi)
  foundMols={mSmi:mol}
  # the reactions are applied group by group; molecules carried over from one
  # group (newPool) form the active pool of the next
  for gpIdx,reactionGp in enumerate(reactions):
    newPool = {}
    while activePool:
      matched=False
      # take the first key of the pool (same pick as the former iterkeys())
      nSmi = next(iter(activePool))
      mol = activePool.pop(nSmi)
      for rxnIdx,reaction in enumerate(reactionGp):
        if onlyUseReactions and (gpIdx,rxnIdx) not in onlyUseReactions:
          continue
        if not silent:
          print('--------')
          print(smartsGps[gpIdx][rxnIdx])
        ps = reaction.RunReactants((mol,))
        if ps:
          if not silent: print(nSmi,'->',len(ps),'products')
          for prodSeq in ps:
            seqOk=True
            # we want to disqualify small fragments, so sort the product sequence by size
            tSeq = sorted((prod.GetNumAtoms(onlyExplicit=True),idx) for idx,prod in enumerate(prodSeq))
            for nats,idx in tSeq:
              prod = prodSeq[idx]
              try:
                Chem.SanitizeMol(prod)
              except Exception:
                # NOTE(review): a product that fails sanitization is skipped
                # without pSmi being set, yet prod.pSmi is read below for every
                # product of an accepted sequence -- confirm intent.
                continue
              pSmi = Chem.MolToSmiles(prod,1)
              if minFragmentSize>0:
                nDummies = pSmi.count('*')
                if nats-nDummies<minFragmentSize:
                  seqOk=False
                  break
              prod.pSmi = pSmi
            prodSeq = [(nats,prodSeq[idx]) for nats,idx in tSeq]
            if seqOk:
              matched=True
              for nats,prod in prodSeq:
                pSmi = prod.pSmi
                if pSmi not in allNodes:
                  if not singlePass:
                    # queue the new fragment for further decomposition
                    activePool[pSmi] = prod
                  allNodes.add(pSmi)
                  foundMols[pSmi]=prod
      # keep molecules that no reaction in this group could split (and, when
      # requested, those that were split as well) for the next group
      if singlePass or keepNonLeafNodes or not matched:
        newPool[nSmi]=mol
    activePool = newPool
  if not (singlePass or keepNonLeafNodes):
    # only the fully decomposed leaves remain in the pool
    if not returnMols:
      res = set(activePool.keys())
    else:
      res = activePool.values()
  else:
    if not returnMols:
      res = allNodes
    else:
      res = foundMols.values()
  return res
Beispiel #28
0
def RecapDecompose(mol,allNodes=None,minFragmentSize=0,onlyUseReactions=None):
  """ returns the recap decomposition for a molecule

  **Arguments**

    - mol: the molecule to decompose

    - allNodes: (optional) dictionary mapping SMILES to already-built
      hierarchy nodes.  It is updated in place; if the molecule's SMILES
      is already present, the stored node is returned immediately.

    - minFragmentSize: (optional) when > 0, any product sequence containing
      a fragment with fewer than this many non-dummy atoms is rejected.
      Otherwise sequences containing a fragment that reduces to '', 'C',
      'CC' or 'CCC' once dummy atoms are stripped are rejected.

    - onlyUseReactions: (optional) collection of indices into the module's
      reaction list; when provided, only these reactions are applied.

  **Returns**

    the RecapHierarchyNode at the root of the decomposition

  """
  mSmi = Chem.MolToSmiles(mol,1)

  if allNodes is None:
    allNodes={}
  if mSmi in allNodes:
    return allNodes[mSmi]

  res = RecapHierarchyNode(mol)
  res.smiles = mSmi
  activePool={mSmi:res}
  allNodes[mSmi]=res
  while activePool:
    # take the first key of the pool (same pick as the former iterkeys())
    nSmi = next(iter(activePool))
    node = activePool.pop(nSmi)
    if not node.mol:
      continue
    for rxnIdx,reaction in enumerate(reactions):
      if onlyUseReactions and rxnIdx not in onlyUseReactions:
        continue
      for prodSeq in reaction.RunReactants((node.mol,)):
        seqOk=True
        # we want to disqualify small fragments, so sort the product sequence by size
        # and then look for "forbidden" fragments
        tSeq = sorted((prod.GetNumAtoms(onlyExplicit=True),idx) for idx,prod in enumerate(prodSeq))
        prodSeq = [(nats,prodSeq[idx]) for nats,idx in tSeq]
        for nats,prod in prodSeq:
          try:
            Chem.SanitizeMol(prod)
          except Exception:
            # NOTE(review): a product that fails sanitization is skipped
            # without pSmi being set, yet the loop below reads prod.pSmi
            # for every product of an accepted sequence -- confirm intent.
            continue
          pSmi = Chem.MolToSmiles(prod,1)
          if minFragmentSize>0:
            nDummies = pSmi.count('*')
            if nats-nDummies<minFragmentSize:
              seqOk=False
              break
          # don't forget after replacing dummy atoms to remove any empty
          # branches:
          elif pSmi.replace('[*]','').replace('()','') in ('','C','CC','CCC'):
            seqOk=False
            break
          prod.pSmi = pSmi
        if not seqOk:
          continue
        for nats,prod in prodSeq:
          pSmi = prod.pSmi
          if pSmi not in allNodes:
            # new fragment: register it and queue it for decomposition
            pNode = RecapHierarchyNode(prod)
            pNode.smiles=pSmi
            activePool[pSmi] = pNode
            allNodes[pSmi]=pNode
          else:
            pNode=allNodes[pSmi]
          # link parent and child; the parent is held through a weak proxy
          pNode.parents[nSmi]=weakref.proxy(node)
          node.children[pSmi]=pNode
  return res
Beispiel #29
0
def _saneBRICSProducts(prodSeq, minFragmentSize):
    """ sanitizes one reaction product sequence and applies the size filter

    The products are visited from smallest to largest (by explicit atom
    count). Products that cannot be sanitized are left out of the result.

    Returns a list of (smiles, mol) pairs for the usable products, or None if
    any sanitized product has fewer than minFragmentSize non-dummy atoms (in
    which case the whole sequence is rejected). The list can be empty if no
    product could be sanitized.
    """
    sizeOrder = sorted((prod.GetNumAtoms(onlyExplicit=True), idx)
                       for idx, prod in enumerate(prodSeq))
    usable = []
    for nats, idx in sizeOrder:
        prod = prodSeq[idx]
        try:
            Chem.SanitizeMol(prod)
        except Exception:
            # best effort: a product that cannot be sanitized is ignored
            # rather than aborting the whole decomposition
            continue
        pSmi = Chem.MolToSmiles(prod, 1)
        # dummy atoms (attachment points) show up as '*' in the SMILES and
        # do not count towards the fragment size
        if minFragmentSize > 0 and nats - pSmi.count('*') < minFragmentSize:
            return None
        # the SMILES is also cached on the molecule itself, as before
        prod.pSmi = pSmi
        usable.append((pSmi, prod))
    return usable


def BRICSDecompose(mol,
                   allNodes=None,
                   minFragmentSize=1,
                   onlyUseReactions=None,
                   silent=True,
                   keepNonLeafNodes=False,
                   singlePass=False,
                   returnMols=False):
    """ returns the BRICS decomposition for a molecule

    Arguments:
      - mol: the molecule to decompose
      - allNodes: optional set of SMILES that have already been seen. If the
        SMILES of mol is already in the set, an empty set is returned
        immediately. The set is updated in place with every SMILES found.
      - minFragmentSize: if >0, a product sequence is rejected when any of
        its products has fewer than this many explicit non-dummy atoms
      - onlyUseReactions: optional collection of (groupIndex, reactionIndex)
        tuples; when non-empty, only those reactions are applied
      - silent: if False, the reactions tried and the number of products
        are printed
      - keepNonLeafNodes: if True, fragments that were decomposed further
        are returned along with the leaves
      - singlePass: if True, products are not fed back in for further
        decomposition (non-leaf nodes are returned in this mode too)
      - returnMols: if True, molecules are returned instead of SMILES

    Returns:
      - leaves only (the default): a new set of SMILES, or the values of a
        dict of molecules if returnMols is set
      - with keepNonLeafNodes or singlePass: the allNodes set itself
        (including anything it contained before the call), or the values of
        a dict with the molecules found in this call if returnMols is set

    Products that cannot be sanitized are ignored; a product sequence in
    which no product can be sanitized does not count as a decomposition.

    >>> from rdkit import Chem
    >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
    >>> res = list(BRICSDecompose(m))
    >>> sorted(res)
    ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]']

    >>> res = list(BRICSDecompose(m,returnMols=True))
    >>> res[0]
    <rdkit.Chem.rdchem.Mol object ...>
    >>> smis = [Chem.MolToSmiles(x,True) for x in res]
    >>> sorted(smis)
    ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]']

    nexavar, an example from the paper (corrected):
    >>> m = Chem.MolFromSmiles('CNC(=O)C1=NC=CC(OC2=CC=C(NC(=O)NC3=CC(=C(Cl)C=C3)C(F)(F)F)C=C2)=C1')
    >>> res = list(BRICSDecompose(m))
    >>> sorted(res)
    ['[1*]C([1*])=O', '[1*]C([6*])=O', '[14*]c1cc([16*])ccn1', '[16*]c1ccc(Cl)c([16*])c1', '[16*]c1ccc([16*])cc1', '[3*]O[3*]', '[5*]NC', '[5*]N[5*]', '[8*]C(F)(F)F']

    it's also possible to keep pieces that haven't been fully decomposed:
    >>> m = Chem.MolFromSmiles('CCCOCC')
    >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True))
    >>> sorted(res)
    ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[3*]O[3*]', '[4*]CC', '[4*]CCC']

    >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
    >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True))
    >>> sorted(res)
    ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[16*]c1cccc([16*])c1', '[3*]OCCC', '[3*]OC[8*]', '[3*]OCc1cccc(-c2ccccn2)c1', '[3*]OCc1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]', '[4*]Cc1cccc(-c2ccccn2)c1', '[4*]Cc1cccc([16*])c1', '[8*]COCCC']

    or to only do a single pass of decomposition:
    >>> m = Chem.MolFromSmiles('CCCOCc1cc(c2ncccc2)ccc1')
    >>> res = list(BRICSDecompose(m,singlePass=True))
    >>> sorted(res)
    ['CCCOCc1cccc(-c2ccccn2)c1', '[14*]c1ccccn1', '[16*]c1cccc(-c2ccccn2)c1', '[16*]c1cccc(COCCC)c1', '[3*]OCCC', '[3*]OCc1cccc(-c2ccccn2)c1', '[4*]CCC', '[4*]Cc1cccc(-c2ccccn2)c1', '[8*]COCCC']

    setting a minimum size for the fragments:
    >>> m = Chem.MolFromSmiles('CCCOCC')
    >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=2))
    >>> sorted(res)
    ['CCCOCC', '[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC']
    >>> m = Chem.MolFromSmiles('CCCOCC')
    >>> res = list(BRICSDecompose(m,keepNonLeafNodes=True,minFragmentSize=3))
    >>> sorted(res)
    ['CCCOCC', '[3*]OCC', '[4*]CCC']
    >>> res = list(BRICSDecompose(m,minFragmentSize=2))
    >>> sorted(res)
    ['[3*]OCC', '[3*]OCCC', '[4*]CC', '[4*]CCC']


    """
    mSmi = Chem.MolToSmiles(mol, 1)

    if allNodes is None:
        allNodes = set()

    if mSmi in allNodes:
        return set()

    activePool = {mSmi: mol}
    allNodes.add(mSmi)
    foundMols = {mSmi: mol}
    # the reaction groups are applied one after the other: whatever survives
    # one group (newPool) is the starting pool for the next group
    for gpIdx, reactionGp in enumerate(reactions):
        newPool = {}
        while activePool:
            matched = False
            # take the oldest entry of the pool; products found below may be
            # added to the pool while it is being drained
            nSmi = next(iter(activePool))
            curMol = activePool.pop(nSmi)
            for rxnIdx, reaction in enumerate(reactionGp):
                if onlyUseReactions and (gpIdx,
                                         rxnIdx) not in onlyUseReactions:
                    continue
                if not silent:
                    print('--------')
                    print(smartsGps[gpIdx][rxnIdx])
                ps = reaction.RunReactants((curMol, ))
                if not ps:
                    continue
                if not silent:
                    print(nSmi, '->', len(ps), 'products')
                for prodSeq in ps:
                    products = _saneBRICSProducts(prodSeq, minFragmentSize)
                    if not products:
                        # rejected by the size filter, or nothing usable
                        continue
                    matched = True
                    for pSmi, prod in products:
                        if pSmi not in allNodes:
                            if not singlePass:
                                activePool[pSmi] = prod
                            allNodes.add(pSmi)
                            foundMols[pSmi] = prod
            # a molecule moves on to the next reaction group if nothing in
            # this group fragmented it, or if non-leaf nodes are wanted
            if singlePass or keepNonLeafNodes or not matched:
                newPool[nSmi] = curMol
        activePool = newPool

    if not (singlePass or keepNonLeafNodes):
        # only the leaves: whatever is left in the pool after the last group
        if returnMols:
            return activePool.values()
        return set(activePool.keys())
    if returnMols:
        return foundMols.values()
    return allNodes