Example #1
0
 def test2ranker(self):
     """Check ranking when the ranker is restricted with SetMaskBits.

     Seeded random fingerprints are accumulated into an ENTROPY ranker on
     which five mask bits were set. The test expects GetTopN(5) to report
     exactly those five bits and GetTopN(10) to raise.
     """
     nbits = 100
     ninst = 100
     dm = 50
     nact = 10
     nc = 2
     RDRandom.seed(23)
     rn = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
     rn.SetMaskBits([63, 70, 15, 25, 10])
     for _ in range(ninst):
         v = DataStructs.SparseBitVect(nbits)
         for _ in range(dm):
             v.SetBit(RDRandom.randrange(0, nbits))
         # the label is drawn after the bits so the random sequence is
         # consumed in the same order as before
         label = 1 if RDRandom.randrange(0, ninst) < nact else 0
         rn.AccumulateVotes(v, label)
     ids = sorted(int(x[0]) for x in rn.GetTopN(5))
     # assertEqual reports both lists on failure, unlike assertTrue(a == b)
     self.assertEqual(ids, [10, 15, 25, 63, 70])
     # An empty pattern matched every message, so plain assertRaises is
     # equivalent and avoids the deprecated assertRaisesRegexp alias.
     with self.assertRaises(Exception):
         rn.GetTopN(10)
Example #2
0
 def test2ranker(self):
   """Check ranking when the ranker is restricted with SetMaskBits.

   Seeded random fingerprints are accumulated into an ENTROPY ranker on
   which five mask bits were set. The test expects GetTopN(5) to report
   exactly those five bits and GetTopN(10) to raise.
   """
   nbits = 100
   ninst = 100
   dm = 50
   nact = 10
   nc = 2
   RDRandom.seed(23)
   rn = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
   rn.SetMaskBits([63, 70, 15, 25, 10])
   for _ in range(ninst):
     v = DataStructs.SparseBitVect(nbits)
     for _ in range(dm):
       v.SetBit(RDRandom.randrange(0, nbits))
     # the label is drawn after the bits so the random sequence is
     # consumed in the same order as before
     label = 1 if RDRandom.randrange(0, ninst) < nact else 0
     rn.AccumulateVotes(v, label)
   ids = sorted(int(x[0]) for x in rn.GetTopN(5))
   # assertEqual reports both lists on failure, unlike assertTrue(a == b)
   self.assertEqual(ids, [10, 15, 25, 63, 70])
   # An empty pattern matched every message, so plain assertRaises is
   # equivalent and avoids the deprecated assertRaisesRegexp alias.
   with self.assertRaises(Exception):
     rn.GetTopN(10)
Example #3
0
 def testReplacementSelection(self):
   # " use selection with replacement "
   # Seeds RDRandom, generates 200 random examples and runs the
   # cross-validation driver with replacementSelection=1.
   RDRandom.seed(self.randomSeed)
   generated = randomtest.GenRandomExamples(nExamples=200, seed=self.randomArraySeed)
   examples, attrs, nPossibleVals = generated
   outcome = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1,
                                                 replacementSelection=1)
   tree, frac = outcome
   self.assertTrue(tree)
   # the fraction returned by the driver is compared to 4 decimal places
   self.assertAlmostEqual(frac, 0.01666, 4)
Example #4
0
 def testReplacementSelection(self):
     # " use selection with replacement "
     # Seeds RDRandom, generates 200 random examples and runs the
     # cross-validation driver with replacementSelection=1.
     RDRandom.seed(self.randomSeed)
     generated = randomtest.GenRandomExamples(nExamples=200,
                                              seed=self.randomArraySeed)
     examples, attrs, nPossibleVals = generated
     outcome = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals,
                                                   silent=1, replacementSelection=1)
     tree, frac = outcome
     self.assertTrue(tree)
     # the fraction returned by the driver is compared to 4 decimal places
     self.assertAlmostEqual(frac, 0.01666, 4)
Example #5
0
 def testReplacementSelection(self):
     """ cross validation using selection with replacement

     Seeds RDRandom, generates 200 random examples and checks both the tree
     and the fraction returned by the cross-validation driver.
     """
     from rdkit.ML.DecTree import randomtest
     from rdkit import RDRandom
     RDRandom.seed(self.randomSeed)
     generated = randomtest.GenRandomExamples(nExamples=200,
                                              seed=self.randomArraySeed)
     examples, attrs, nPossibleVals = generated
     outcome = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals,
                                                   silent=1, replacementSelection=1)
     tree, frac = outcome
     assert tree
     assert feq(frac, 0.0833)
Example #6
0
 def testReplacementSelection(self):
   """ cross validation using selection with replacement

   Seeds RDRandom, generates 200 random examples and checks both the tree
   and the fraction returned by the cross-validation driver.
   """
   from rdkit.ML.DecTree import randomtest
   from rdkit import RDRandom
   RDRandom.seed(self.randomSeed)
   generated = randomtest.GenRandomExamples(nExamples=200,
                                            seed=self.randomArraySeed)
   examples, attrs, nPossibleVals = generated
   outcome = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals,
                                                 silent=1, replacementSelection=1)
   tree, frac = outcome
   assert tree
   assert feq(frac, 0.0833)
Example #7
0
  def testResults(self):
    # " test the results of CrossValidation "
    # Builds a tree from seeded random examples and compares it with the
    # reference tree pickled in self.origTreeName.
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(nExamples=200,
                                                                  seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    self.assertGreater(frac, 0)

    # The reference file is read as text so '\r\n' line endings can be
    # normalised before unpickling; the with-block closes the file, so no
    # explicit close() is needed.
    with open(self.origTreeName, 'r') as inTFile:
      buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    # NOTE: unpickling is only safe because this is a trusted fixture file.
    oTree = cPickle.load(BytesIO(buf))

    # assertEqual still runs under `python -O`, unlike a bare assert
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
Example #8
0
def InitRandomNumbers(seed):
  """ Seeds the random number generators

    **Arguments**

      - seed: a 2-tuple containing integers to be used as the random number seeds

    **Notes**

      - both the RDRandom generator and the one in the standard Python
        _random_ module are seeded

      - only the first element of _seed_ is used; it is passed to both
        generators

  """
  from rdkit import RDRandom
  seedValue = seed[0]
  RDRandom.seed(seedValue)
  random.seed(seedValue)
Example #9
0
def InitRandomNumbers(seed):
    """ Seeds the random number generators

    **Arguments**

      - seed: a 2-tuple containing integers to be used as the random number seeds

    **Notes**

      - both the RDRandom generator and the one in the standard Python
        _random_ module are seeded

      - only the first element of _seed_ is used; it is passed to both
        generators

    """
    from rdkit import RDRandom
    seedValue = seed[0]
    RDRandom.seed(seedValue)
    random.seed(seedValue)
Example #10
0
  def testResults(self):
    """ test the results of CrossValidation

    A tree built from seeded random examples is compared with the tree
    unpickled from self.origTreeName.
    """
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    generated = randomtest.GenRandomExamples(nExamples=200,
                                             seed=self.randomArraySeed)
    examples, attrs, nPossibleVals = generated
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals,
                                                     silent=1)

    from rdkit.six.moves import cPickle
    #cPickle.dump(tree,open(self.origTreeName,'w+'))
    with open(self.origTreeName, 'rb') as refFile:
      reference = cPickle.load(refFile)

    assert reference == tree, 'Random CrossValidation test failed'
Example #11
0
    def testResults(self):
        """ test the results of CrossValidation

        A tree built from seeded random examples is compared with the tree
        unpickled from self.origTreeName.
        """
        from rdkit.ML.DecTree import randomtest
        from rdkit import RDRandom
        RDRandom.seed(self.randomSeed)
        examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
            nExamples=200, seed=self.randomArraySeed)
        tree, frac = CrossValidate.CrossValidationDriver(examples,
                                                         attrs,
                                                         nPossibleVals,
                                                         silent=1)

        import cPickle
        #cPickle.dump(tree,file(self.origTreeName,'w+'))
        # A context manager closes the reference file even when unpickling
        # raises; previously the handle was never closed.
        with open(self.origTreeName, 'r') as inFile:
            oTree = cPickle.load(inFile)

        assert oTree == tree, 'Random CrossValidation test failed'
Example #12
0
    def testResults(self):
        # " test the results of CrossValidation "
        # Builds a tree from seeded random examples and compares it with the
        # reference tree pickled in self.origTreeName.
        RDRandom.seed(self.randomSeed)
        examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
            nExamples=200, seed=self.randomArraySeed)
        tree, frac = CrossValidate.CrossValidationDriver(examples,
                                                         attrs,
                                                         nPossibleVals,
                                                         silent=1)
        self.assertGreater(frac, 0)

        # The reference file is read as text so '\r\n' line endings can be
        # normalised before unpickling; the with-block closes the file, so
        # no explicit close() is needed.
        with open(self.origTreeName, 'r') as inTFile:
            buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
        # NOTE: unpickling is only safe because this is a trusted fixture file.
        oTree = pickle.load(BytesIO(buf))

        # assertEqual still runs under `python -O`, unlike a bare assert
        self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
Example #13
0
    def test1ranker(self):
        """Compare rankers built with the ENTROPY, default and BIASENTROPY settings.

        The same randomly labelled fingerprints are fed to three rankers:
        the explicit ENTROPY ranker and the default one must give identical
        top-50 results, and every BIASENTROPY row must satisfy
        column 2 / na > column 3 / ni.
        """
        nbits = 100
        ninst = 100
        dm = 50
        nact = 10
        nc = 2
        entropyRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
        labelled = []
        nActive = 0
        nInactive = 0
        for _ in range(ninst):
            bv = DataStructs.SparseBitVect(nbits)
            for _ in range(dm):
                bv.SetBit(RDRandom.randrange(0, nbits))

            # one more draw decides the class of this fingerprint
            isActive = RDRandom.randrange(0, ninst) < nact
            if isActive:
                label = 1
                nActive += 1
            else:
                label = 0
                nInactive += 1
            entropyRanker.AccumulateVotes(bv, label)
            labelled.append((bv, label))

        entropyTop = entropyRanker.GetTopN(50)

        defaultRanker = rdit.InfoBitRanker(nbits, nc)
        for bv, label in labelled:
            defaultRanker.AccumulateVotes(bv, label)

        defaultTop = defaultRanker.GetTopN(50)
        self.assertTrue((entropyTop == defaultTop).all())

        biasRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.BIASENTROPY)
        #rn3.SetBiasList([0])
        for bv, label in labelled:
            biasRanker.AccumulateVotes(bv, label)

        biasTop = biasRanker.GetTopN(50)
        for row in range(50):
            # presumably columns 2 and 3 are per-class counts -- confirm
            activeFrac = biasTop[row, 2] / nActive
            inactiveFrac = biasTop[row, 3] / nInactive
            self.assertTrue(activeFrac > inactiveFrac)
Example #14
0
    def test1ranker(self):
        """Compare rankers built with the ENTROPY, default and BIASENTROPY settings.

        The same randomly labelled fingerprints are fed to three rankers:
        the explicit ENTROPY ranker and the default one must give identical
        top-50 results, and every BIASENTROPY row must satisfy
        column 2 / na > column 3 / ni.
        """
        nbits = 100
        ninst = 100
        dm = 50
        nact = 10
        nc = 2
        entropyRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
        labelled = []
        nActive = 0
        nInactive = 0
        for _ in range(ninst):
            bv = DataStructs.SparseBitVect(nbits)
            for _ in range(dm):
                bv.SetBit(RDRandom.randrange(0, nbits))

            # one more draw decides the class of this fingerprint
            isActive = RDRandom.randrange(0, ninst) < nact
            if isActive:
                label = 1
                nActive += 1
            else:
                label = 0
                nInactive += 1
            entropyRanker.AccumulateVotes(bv, label)
            labelled.append((bv, label))

        entropyTop = entropyRanker.GetTopN(50)

        defaultRanker = rdit.InfoBitRanker(nbits, nc)
        for bv, label in labelled:
            defaultRanker.AccumulateVotes(bv, label)

        defaultTop = defaultRanker.GetTopN(50)
        self.assertTrue((entropyTop == defaultTop).all())

        biasRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.BIASENTROPY)
        #rn3.SetBiasList([0])
        for bv, label in labelled:
            biasRanker.AccumulateVotes(bv, label)

        biasTop = biasRanker.GetTopN(50)
        for row in range(50):
            # presumably columns 2 and 3 are per-class counts -- confirm
            activeFrac = biasTop[row, 2] / nActive
            inactiveFrac = biasTop[row, 3] / nInactive
            self.assertTrue(activeFrac > inactiveFrac)
Example #15
0
    def testResults(self):
        """ test the results of CrossValidation

        A tree built from seeded random examples is compared with the tree
        unpickled from self.origTreeName.
        """
        from rdkit.ML.DecTree import randomtest
        from rdkit import RDRandom

        RDRandom.seed(self.randomSeed)
        examples, attrs, nPossibleVals = randomtest.GenRandomExamples(nExamples=200, seed=self.randomArraySeed)
        tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)

        from rdkit.six.moves import cPickle

        # cPickle.dump(tree,open(self.origTreeName,'w+'))
        # The reference file is read as text so "\r\n" line endings can be
        # normalised before unpickling; the with-block closes the file, so
        # no explicit close() is needed.
        with open(self.origTreeName, "r") as inTFile:
            buf = inTFile.read().replace("\r\n", "\n").encode("utf-8")
        with io.BytesIO(buf) as inFile:
            oTree = cPickle.load(inFile)

        assert oTree == tree, "Random CrossValidation test failed"
Example #16
0
  def testResults(self):
    """ test the results of CrossValidation

    A tree built from seeded random examples is compared with the tree
    unpickled from self.origTreeName.
    """
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples,attrs,nPossibleVals = randomtest.GenRandomExamples(nExamples = 200,
                                                                seed=self.randomArraySeed)
    tree,frac = CrossValidate.CrossValidationDriver(examples,attrs,
                                                    nPossibleVals,silent=1)

    from rdkit.six.moves import cPickle
    #cPickle.dump(tree,open(self.origTreeName,'w+'))
    # The reference file is read as text so '\r\n' line endings can be
    # normalised before unpickling; the with-block closes the file, so no
    # explicit close() is needed.
    with open(self.origTreeName,'r') as inTFile:
      buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as inFile:
      oTree = cPickle.load(inFile)

    assert oTree==tree,'Random CrossValidation test failed'
Example #17
0
 def setUp(self):
     # Seed RDRandom with a fixed value so random draws are repeatable.
     fixedSeed = 25
     RDRandom.seed(fixedSeed)
Example #18
0
 def setUp(self):
   # Seed RDRandom with a fixed value so random draws are repeatable.
   fixedSeed = 25
   RDRandom.seed(fixedSeed)
Example #19
0
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
    """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement

    **Returns**

     a 2-tuple containing the two sets of indices: the first (training)
     set, then the hold-out (test) set.

    **Raises**

     - ValueError: if _frac_ is not between 0.0 and 1.0

    **Notes**

     - the _legacy_ splitting approach uses randomly-generated floats
       and compares them to _frac_.  This is provided for
       backwards-compatibility reasons.

     - the default splitting approach uses a random permutation of
       indices which is split into two parts.

     - selection with replacement can generate duplicates.

     - _replacement_ takes precedence over _legacy_ when both are set.


    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:
    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5)
    >>> train
    [1, 5, 6, 4, 2]
    >>> test
    [3, 0, 7, 8, 9]

    >>> train,test = SplitIndices(10,.5)
    >>> train
    [5, 2, 9, 8, 7]
    >>> test
    [6, 0, 3, 1, 4]


    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [0, 1, 2, 3, 4, 7, 9]
    >>> test
    [5, 6, 8]
    >>> train,test = SplitIndices(10,.5,legacy=1)
    >>> train
    [4, 5, 7, 8, 9]
    >>> test
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.
    >>> DataUtils.InitRandomNumbers((23,42))
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [1, 1, 3, 0, 1]
    >>> test
    [2, 4, 5, 6, 7, 8, 9]
    >>> train,test = SplitIndices(10,.5,replacement=1)
    >>> train
    [9, 5, 4, 8, 0]
    >>> test
    [1, 2, 3, 6, 7]

    """
    # written as a chained comparison so that NaN is rejected as well
    if not (0. <= frac <= 1.):
        raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

    if replacement:
        nTrain = int(nPts * frac)
        resData = [None] * nTrain
        for i in range(nTrain):
            val = int(RDRandom.random() * nPts)
            # clamp in case the generator ever returns exactly 1.0
            if val == nPts:
                val = nPts - 1
            resData[i] = val
        # a set makes each membership test O(1) instead of scanning the list
        picked = set(resData)
        resTest = [i for i in range(nPts) if i not in picked]
    elif legacy:
        resData = []
        resTest = []
        for i in range(nPts):
            val = RDRandom.random()
            if val < frac:
                resData.append(i)
            else:
                resTest.append(i)
    else:
        # shuffle needs a mutable sequence; range() is not a list on Python 3
        perm = list(range(nPts))
        random.shuffle(perm)
        nTrain = int(nPts * frac)

        resData = perm[:nTrain]
        resTest = perm[nTrain:]

    if not silent:
        # single-argument print(...) behaves the same on Python 2 and 3
        print('Training with %d (of %d) points.' % (len(resData), nPts))
        print('\t%d points are in the hold-out set.' % (len(resTest)))
    return resData, resTest
Example #20
0
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom
# fixed seed so the same point set is generated on every run
RDRandom.seed(23)


pkr = rdsimdiv.MaxMinPicker()

n = 1000
m = 80
# n random 2D points; each coordinate is a random draw scaled by 10
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix, stored flat with one entry per pair i < j.
# Floor division keeps the size and the row offsets integral when `/` is
# true division; both products are always even, so nothing is truncated.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset such that itab + j is the flat index of pair (i, j)
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        id = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1

        # Euclidean distance between the two points
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[id] = dist
        
Example #21
0
def SplitIndices(nPts,frac,silent=1,legacy=0,replacement=0):
  """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement

   **Returns**

     a 2-tuple containing the two sets of indices: the first (training)
     set, then the hold-out (test) set.

   **Raises**

     - ValueError: if _frac_ is not between 0.0 and 1.0

   **Notes**

     - the _legacy_ splitting approach uses randomly-generated floats
       and compares them to _frac_.  This is provided for
       backwards-compatibility reasons.

     - the default splitting approach uses a random permutation of
       indices which is split into two parts.

     - selection with replacement can generate duplicates.

     - _replacement_ takes precedence over _legacy_ when both are set.


  **Usage**:

  We'll start with a set of indices and pick from them using
  the three different approaches:
  >>> from rdkit.ML.Data import DataUtils

  The base approach always returns the same number of compounds in
  each set and has no duplicates:
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5)
  >>> train
  [1, 5, 6, 4, 2]
  >>> test
  [3, 0, 7, 8, 9]

  >>> train,test = SplitIndices(10,.5)
  >>> train
  [5, 2, 9, 8, 7]
  >>> test
  [6, 0, 3, 1, 4]


  The legacy approach can return varying numbers, but still has no
  duplicates.  Note the indices come back ordered:
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5,legacy=1)
  >>> train
  [0, 1, 2, 3, 4, 7, 9]
  >>> test
  [5, 6, 8]
  >>> train,test = SplitIndices(10,.5,legacy=1)
  >>> train
  [4, 5, 7, 8, 9]
  >>> test
  [0, 1, 2, 3, 6]

  The replacement approach returns a fixed number in the training set,
  a variable number in the test set and can contain duplicates in the
  training set.
  >>> DataUtils.InitRandomNumbers((23,42))
  >>> train,test = SplitIndices(10,.5,replacement=1)
  >>> train
  [1, 1, 3, 0, 1]
  >>> test
  [2, 4, 5, 6, 7, 8, 9]
  >>> train,test = SplitIndices(10,.5,replacement=1)
  >>> train
  [9, 5, 4, 8, 0]
  >>> test
  [1, 2, 3, 6, 7]

  """
  # written as a chained comparison so that NaN is rejected as well
  if not (0. <= frac <= 1.):
    raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)'%(frac))

  if replacement:
    nTrain = int(nPts*frac)
    resData = [None]*nTrain
    for i in range(nTrain):
      val = int(RDRandom.random()*nPts)
      # clamp in case the generator ever returns exactly 1.0
      if val==nPts:
        val = nPts-1
      resData[i] = val
    # a set makes each membership test O(1) instead of scanning the list
    picked = set(resData)
    resTest = [i for i in range(nPts) if i not in picked]
  elif legacy:
    resData = []
    resTest = []
    for i in range(nPts):
      val = RDRandom.random()
      if val < frac:
        resData.append(i)
      else:
        resTest.append(i)
  else:
    # shuffle needs a mutable sequence; range() is not a list on Python 3
    perm = list(range(nPts))
    random.shuffle(perm)
    nTrain = int(nPts*frac)

    resData = perm[:nTrain]
    resTest = perm[nTrain:]

  if not silent:
    # single-argument print(...) behaves the same on Python 2 and 3
    print('Training with %d (of %d) points.'%(len(resData),nPts))
    print('\t%d points are in the hold-out set.'%(len(resTest)))
  return resData,resTest
Example #22
0
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom
# fixed seed so the same point set is generated on every run
RDRandom.seed(23)

pkr = rdsimdiv.MaxMinPicker()

n = 1000
m = 80
# n random 2D points; each coordinate is a random draw scaled by 10
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix, stored flat with one entry per pair i < j.
# Floor division keeps the size and the row offsets integral when `/` is
# true division; both products are always even, so nothing is truncated.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset such that itab + j is the flat index of pair (i, j)
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        id = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1

        # Euclidean distance between the two points
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[id] = dist

# now do the picking: the picker gets the flat distance matrix, n and m
res = pkr.Pick(distMat, n, m)