def test2ranker(self):
    # Checks InfoBitRanker with a bit mask applied: after SetMaskBits() the
    # top-5 result must consist of exactly the masked bit ids, and asking for
    # more bits than were masked must raise.
    nbits = 100
    ninst = 100
    dm = 50
    nact = 10
    nc = 2
    RDRandom.seed(23)
    rn = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
    rn.SetMaskBits([63, 70, 15, 25, 10])
    for _inst in range(ninst):
        v = DataStructs.SparseBitVect(nbits)
        for _bit in range(dm):
            v.SetBit(RDRandom.randrange(0, nbits))
        # The order of the RDRandom calls is kept exactly as before so the
        # seeded random stream (and therefore the votes) is unchanged.
        if RDRandom.randrange(0, ninst) < nact:
            rn.AccumulateVotes(v, 1)
        else:
            rn.AccumulateVotes(v, 0)

    res = rn.GetTopN(5)
    ids = sorted(int(row[0]) for row in res)
    self.assertEqual(ids, [10, 15, 25, 63, 70])

    # assertRaisesRegexp(Exception, "") accepted any message; assertRaises is
    # the equivalent check and is not a deprecated alias.
    with self.assertRaises(Exception):
        rn.GetTopN(10)
def testReplacementSelection(self):
    # Cross validation using selection with replacement: a tree must be
    # built and the returned fraction must match the reference value to
    # 4 decimal places.
    RDRandom.seed(self.randomSeed)
    generated = randomtest.GenRandomExamples(nExamples=200, seed=self.randomArraySeed)
    examples, attrs, nPossibleVals = generated
    outcome = CrossValidate.CrossValidationDriver(
        examples,
        attrs,
        nPossibleVals,
        silent=1,
        replacementSelection=1,
    )
    tree, frac = outcome
    self.assertTrue(tree)
    self.assertAlmostEqual(frac, 0.01666, places=4)
def testReplacementSelection(self):
    # Runs the cross-validation driver with replacementSelection enabled and
    # checks both that a tree comes back and that the reported fraction
    # agrees with the stored reference (4 decimal places).
    RDRandom.seed(self.randomSeed)
    exampleSet = randomtest.GenRandomExamples(nExamples=200,
                                              seed=self.randomArraySeed)
    examples, attrs, nPossibleVals = exampleSet
    driverArgs = dict(silent=1, replacementSelection=1)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs,
                                                     nPossibleVals, **driverArgs)
    self.assertTrue(tree)
    self.assertAlmostEqual(frac, 0.01666, places=4)
def testReplacementSelection(self):
    " use selection with replacement "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(
        examples, attrs, nPossibleVals, silent=1, replacementSelection=1)
    # Use the TestCase assertion methods rather than bare `assert`: bare
    # asserts are removed when Python runs with -O, which would let this
    # test pass without checking anything.
    self.assertTrue(tree)
    self.assertTrue(feq(frac, 0.0833))
def testReplacementSelection(self):
    " use selection with replacement "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(
        examples, attrs, nPossibleVals, silent=1, replacementSelection=1)
    # TestCase assertions are used instead of bare `assert` statements so
    # the checks still run under `python -O`.
    self.assertTrue(tree)
    self.assertTrue(feq(frac, 0.0833))
def testResults(self):
    # " test the results of CrossValidation "
    # Builds a tree from seeded random examples and compares it with the
    # reference tree pickled in self.origTreeName.
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    self.assertGreater(frac, 0)

    # The reference pickle is read as text so that Windows line endings can
    # be normalised before unpickling.  The `with` block closes the file; no
    # explicit close() is needed.
    with open(self.origTreeName, 'r') as inTFile:
        buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    inFile = BytesIO(buf)
    oTree = cPickle.load(inFile)

    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
def InitRandomNumbers(seed):
    """ Seeds the random number generators

      **Arguments**

        - seed: a 2-tuple containing integers to be used as the random number
          seeds

      **Notes**

        - this seeds both the RDRandom generator and the one in the standard
          Python _random_ module

        - only the first element of _seed_ is read; it is used for both
          generators

    """
    from rdkit import RDRandom
    firstSeed = seed[0]
    for seeder in (RDRandom.seed, random.seed):
        seeder(firstSeed)
def testResults(self):
    " test the results of CrossValidation "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    from rdkit.six.moves import cPickle
    # To regenerate the reference pickle:
    #   cPickle.dump(tree,open(self.origTreeName,'w+'))
    with open(self.origTreeName, 'rb') as inFile:
        oTree = cPickle.load(inFile)
    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
def testResults(self):
    " test the results of CrossValidation "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    import cPickle
    # To regenerate the reference pickle:
    #   cPickle.dump(tree,file(self.origTreeName,'w+'))
    inFile = open(self.origTreeName, 'r')
    # try/finally guarantees the reference file is closed even when
    # unpickling fails; the handle was previously never closed.
    try:
        oTree = cPickle.load(inFile)
    finally:
        inFile.close()
    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
def testResults(self):
    # " test the results of CrossValidation "
    # Builds a tree from seeded random examples and compares it with the
    # reference tree pickled in self.origTreeName.
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    self.assertGreater(frac, 0)

    # Read as text and normalise '\r\n' to '\n' before unpickling.  The
    # `with` statement already closes the file, so no explicit close() call
    # is made.
    with open(self.origTreeName, 'r') as inTFile:
        buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    inFile = BytesIO(buf)
    oTree = pickle.load(inFile)

    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
def test1ranker(self):
    # Feeds the same randomly generated, labelled bit vectors to three
    # rankers and checks that:
    #  - a ranker built without an explicit info type gives the same top-50
    #    as one built with InfoType.ENTROPY
    #  - for each of the top-50 rows of the BIASENTROPY ranker, column 2
    #    divided by the number of label-1 vectors exceeds column 3 divided
    #    by the number of label-0 vectors
    # NOTE(review): the meaning of result columns 2 and 3 is not visible
    # here -- confirm against the InfoBitRanker documentation.
    nbits = 100
    ninst = 100
    dm = 50
    nact = 10
    nc = 2
    topN = 50

    entropyRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
    labelled = []
    nLabelOne = 0
    nLabelZero = 0
    for _inst in range(ninst):
        vect = DataStructs.SparseBitVect(nbits)
        for _bit in range(dm):
            vect.SetBit(RDRandom.randrange(0,nbits))
        label = 1 if RDRandom.randrange(0,ninst) < nact else 0
        if label:
            nLabelOne += 1
        else:
            nLabelZero += 1
        entropyRanker.AccumulateVotes(vect, label)
        labelled.append((vect,label))
    entropyTop = entropyRanker.GetTopN(topN)

    defaultRanker = rdit.InfoBitRanker(nbits, nc)
    for vect, label in labelled:
        defaultRanker.AccumulateVotes(vect, label)
    defaultTop = defaultRanker.GetTopN(topN)
    self.assertTrue((entropyTop==defaultTop).all())

    biasRanker = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.BIASENTROPY)
    #biasRanker.SetBiasList([0])
    for vect, label in labelled:
        biasRanker.AccumulateVotes(vect, label)
    biasTop = biasRanker.GetTopN(topN)
    for row in range(topN):
        fracOne = biasTop[row,2]/nLabelOne
        fracZero = biasTop[row,3]/nLabelZero
        self.assertTrue(fracOne > fracZero)
def test1ranker(self):
    # Three InfoBitRankers vote on one shared set of random, labelled bit
    # vectors.  The default ranker must reproduce the explicit ENTROPY
    # ranker's top-50, and each top-50 row of the BIASENTROPY ranker must
    # satisfy column2/(number of label-1 vectors) >
    # column3/(number of label-0 vectors).
    # NOTE(review): what columns 2 and 3 hold is not shown in this file --
    # verify against InfoBitRanker.GetTopN.
    nbits = 100
    ninst = 100
    dm = 50
    nact = 10
    nc = 2

    def voteAll(ranker, samples):
        # replay every (vector, label) pair into the given ranker
        for sample in samples:
            ranker.AccumulateVotes(sample[0], sample[1])

    rn = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.ENTROPY)
    samples = []
    tally = {0: 0, 1: 0}
    for _inst in range(ninst):
        bv = DataStructs.SparseBitVect(nbits)
        for _bit in range(dm):
            bv.SetBit(RDRandom.randrange(0, nbits))
        if RDRandom.randrange(0, ninst) < nact:
            cls = 1
        else:
            cls = 0
        tally[cls] += 1
        rn.AccumulateVotes(bv, cls)
        samples.append((bv, cls))
    res = rn.GetTopN(50)

    rn2 = rdit.InfoBitRanker(nbits, nc)
    voteAll(rn2, samples)
    res2 = rn2.GetTopN(50)
    self.assertTrue((res == res2).all())

    rn3 = rdit.InfoBitRanker(nbits, nc, rdit.InfoType.BIASENTROPY)
    #rn3.SetBiasList([0])
    voteAll(rn3, samples)
    res3 = rn3.GetTopN(50)
    na = tally[1]
    ni = tally[0]
    for idx in range(50):
        fan = res3[idx, 2] / na
        fin = res3[idx, 3] / ni
        self.assertTrue(fan > fin)
def testResults(self):
    " test the results of CrossValidation "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    from rdkit.six.moves import cPickle
    # To regenerate the reference pickle:
    #   cPickle.dump(tree,open(self.origTreeName,'w+'))

    # Read as text and normalise '\r\n' to '\n' before unpickling.  The
    # `with` statement closes the file, so the explicit close() was removed.
    with open(self.origTreeName, "r") as inTFile:
        buf = inTFile.read().replace("\r\n", "\n").encode("utf-8")
    with io.BytesIO(buf) as inFile:
        oTree = cPickle.load(inFile)
    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, "Random CrossValidation test failed")
def testResults(self):
    " test the results of CrossValidation "
    from rdkit.ML.DecTree import randomtest
    from rdkit import RDRandom
    RDRandom.seed(self.randomSeed)
    examples, attrs, nPossibleVals = randomtest.GenRandomExamples(
        nExamples=200, seed=self.randomArraySeed)
    tree, frac = CrossValidate.CrossValidationDriver(examples, attrs, nPossibleVals, silent=1)
    from rdkit.six.moves import cPickle
    # To regenerate the reference pickle:
    #   cPickle.dump(tree,open(self.origTreeName,'w+'))

    # The reference file is read in text mode so '\r\n' line endings can be
    # converted to '\n' before the bytes are unpickled.  The `with` block
    # closes the file on exit; no extra close() call is required.
    with open(self.origTreeName, 'r') as inTFile:
        buf = inTFile.read().replace('\r\n', '\n').encode('utf-8')
    with io.BytesIO(buf) as inFile:
        oTree = cPickle.load(inFile)
    # assertEqual instead of a bare `assert`, which is stripped under -O.
    self.assertEqual(oTree, tree, 'Random CrossValidation test failed')
def setUp(self):
    """Seed RDRandom with a fixed value before each test."""
    fixedSeed = 25
    RDRandom.seed(fixedSeed)
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
    """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement

    **Returns**

      a 2-tuple containing the two sets of indices.

    **Raises**

      ValueError if _frac_ is not between 0.0 and 1.0

    **Notes**

      - the _legacy_ splitting approach uses randomly-generated floats
        and compares them to _frac_.  This is provided for
        backwards-compatibility reasons.

      - the default splitting approach uses a random permutation of
        indices which is split into two parts.

      - selection with replacement can generate duplicates.

      - the first list returned is the one sized by _frac_ (it is bound to
        the name `test` in the examples below); the second list holds the
        remaining indices.

    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:

    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5)
    >>> test
    [1, 5, 6, 4, 2]
    >>> train
    [3, 0, 7, 8, 9]

    >>> test,train = SplitIndices(10,.5)
    >>> test
    [5, 2, 9, 8, 7]
    >>> train
    [6, 0, 3, 1, 4]

    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5,legacy=1)
    >>> test
    [0, 1, 2, 3, 4, 7, 9]
    >>> train
    [5, 6, 8]
    >>> test,train = SplitIndices(10,.5,legacy=1)
    >>> test
    [4, 5, 7, 8, 9]
    >>> train
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5,replacement=1)
    >>> test
    [1, 1, 3, 0, 1]
    >>> train
    [2, 4, 5, 6, 7, 8, 9]
    >>> test,train = SplitIndices(10,.5,replacement=1)
    >>> test
    [9, 5, 4, 8, 0]
    >>> train
    [1, 2, 3, 6, 7]

    """
    if frac < 0. or frac > 1.:
        raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

    if replacement:
        nTrain = int(nPts * frac)
        # Draw nTrain indices with replacement; the min() clamps the
        # (unexpected) case where the scaled random value equals nPts.
        resData = [min(int(RDRandom.random() * nPts), nPts - 1) for _ in range(nTrain)]
        # A set makes the membership test O(1) per index instead of scanning
        # the list of picks for every point.
        picked = set(resData)
        resTest = [i for i in range(nPts) if i not in picked]
    elif legacy:
        resData = []
        resTest = []
        for i in range(nPts):
            # one random draw per index, compared against frac
            if RDRandom.random() < frac:
                resData.append(i)
            else:
                resTest.append(i)
    else:
        # random.shuffle needs a mutable sequence; range() is only a list on
        # Python 2, so build the list explicitly.
        # NOTE(review): the doctest outputs above depend on the exact random
        # stream -- re-verify them if the interpreter version changes.
        perm = list(range(nPts))
        random.shuffle(perm)
        nTrain = int(nPts * frac)
        resData = perm[:nTrain]
        resTest = perm[nTrain:]

    if not silent:
        # Single-argument print(...) gives the same output whether print is
        # a statement (Python 2) or a function (Python 3).
        print('Training with %d (of %d) points.' % (len(resData), nPts))
        print('\t%d points are in the hold-out set.' % (len(resTest)))
    return resData, resTest
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom

# Build n random 2D points in [0, 10) x [0, 10) and their condensed
# pairwise-distance matrix.
RDRandom.seed(23)
pkr = rdsimdiv.MaxMinPicker()
n = 1000
m = 80
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# Floor division (//) keeps the size and the offsets integral on both
# Python 2 and Python 3; with "/" they become floats on Python 3 and can no
# longer be used as an array size or index.  Both products are always even,
# so the result is exact.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset such that itab + j is the slot of pair (i, j) with j > i
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        # NOTE: `id` shadows the builtin; the name is kept because later
        # script code outside this view may still refer to it.
        id = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[id] = dist
def SplitIndices(nPts, frac, silent=1, legacy=0, replacement=0):
    """ splits a set of indices into a data set into 2 pieces

    **Arguments**

     - nPts: the total number of points

     - frac: the fraction of the data to be put in the first data set

     - silent: (optional) toggles display of stats

     - legacy: (optional) use the legacy splitting approach

     - replacement: (optional) use selection with replacement

    **Returns**

      a 2-tuple containing the two sets of indices.

    **Raises**

      ValueError if _frac_ is not between 0.0 and 1.0

    **Notes**

      - the _legacy_ splitting approach uses randomly-generated floats
        and compares them to _frac_.  This is provided for
        backwards-compatibility reasons.

      - the default splitting approach uses a random permutation of
        indices which is split into two parts.

      - selection with replacement can generate duplicates.

      - the first list returned is the one sized by _frac_ (it is bound to
        the name `test` in the examples below); the second list holds the
        remaining indices.

    **Usage**:

    We'll start with a set of indices and pick from them using
    the three different approaches:

    >>> from rdkit.ML.Data import DataUtils

    The base approach always returns the same number of compounds in
    each set and has no duplicates:

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5)
    >>> test
    [1, 5, 6, 4, 2]
    >>> train
    [3, 0, 7, 8, 9]

    >>> test,train = SplitIndices(10,.5)
    >>> test
    [5, 2, 9, 8, 7]
    >>> train
    [6, 0, 3, 1, 4]

    The legacy approach can return varying numbers, but still has no
    duplicates.  Note the indices come back ordered:

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5,legacy=1)
    >>> test
    [0, 1, 2, 3, 4, 7, 9]
    >>> train
    [5, 6, 8]
    >>> test,train = SplitIndices(10,.5,legacy=1)
    >>> test
    [4, 5, 7, 8, 9]
    >>> train
    [0, 1, 2, 3, 6]

    The replacement approach returns a fixed number in the training set,
    a variable number in the test set and can contain duplicates in the
    training set.

    >>> DataUtils.InitRandomNumbers((23,42))
    >>> test,train = SplitIndices(10,.5,replacement=1)
    >>> test
    [1, 1, 3, 0, 1]
    >>> train
    [2, 4, 5, 6, 7, 8, 9]
    >>> test,train = SplitIndices(10,.5,replacement=1)
    >>> test
    [9, 5, 4, 8, 0]
    >>> train
    [1, 2, 3, 6, 7]

    """
    if frac < 0. or frac > 1.:
        raise ValueError('frac must be between 0.0 and 1.0 (frac=%f)' % (frac))

    if replacement:
        nTrain = int(nPts * frac)
        resData = []
        for _ in range(nTrain):
            val = int(RDRandom.random() * nPts)
            # guard against the scaled random value landing exactly on nPts
            if val == nPts:
                val = nPts - 1
            resData.append(val)
        # Look picks up in a set: O(1) per index rather than a list scan.
        drawn = set(resData)
        resTest = [i for i in range(nPts) if i not in drawn]
    elif legacy:
        resData = []
        resTest = []
        for i in range(nPts):
            # one random draw per index, compared against frac
            target = resData if RDRandom.random() < frac else resTest
            target.append(i)
    else:
        # range() is not a list on Python 3 and cannot be shuffled in place,
        # so materialise it first (a no-op copy on Python 2).
        # NOTE(review): the doctest outputs above depend on the exact random
        # stream -- re-verify them if the interpreter version changes.
        perm = list(range(nPts))
        random.shuffle(perm)
        nTrain = int(nPts * frac)
        resData = perm[:nTrain]
        resTest = perm[nTrain:]

    if not silent:
        # print(...) with one argument behaves identically as a Python 2
        # statement and as a Python 3 function call.
        print('Training with %d (of %d) points.' % (len(resData), nPts))
        print('\t%d points are in the hold-out set.' % (len(resTest)))
    return resData, resTest
from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
import numpy
from rdkit import RDRandom

# Generate n random 2D points in [0, 10) x [0, 10), build their condensed
# pairwise-distance matrix and pass it to the MaxMinPicker with n and m.
RDRandom.seed(23)
pkr = rdsimdiv.MaxMinPicker()
n = 1000
m = 80
dataPts = []
for i in range(n):
    pt = numpy.zeros(2, 'd')
    pt[0] = 10. * RDRandom.random()
    pt[1] = 10. * RDRandom.random()
    dataPts.append(pt)

# compute the distance matrix
# "//" is used so the matrix size and the row offsets stay integers on
# Python 3 as well as Python 2 (plain "/" produces floats on Python 3,
# which are not valid as an array size or index).  Both products are always
# even, so no precision is lost.
distMat = numpy.zeros(n * (n - 1) // 2, 'd')
for i in range(n - 1):
    # offset such that itab + j is the slot of pair (i, j) with j > i
    itab = n * i - ((i + 1) * (i + 2)) // 2
    pt1 = dataPts[i]
    for j in range(i + 1, n):
        # NOTE: `id` shadows the builtin; the name is kept because later
        # script code outside this view may still refer to it.
        id = itab + j
        pt2 = dataPts[j]
        diff = pt2 - pt1
        dist = numpy.sqrt(numpy.dot(diff, diff))
        distMat[id] = dist

# now do the picking
res = pkr.Pick(distMat, n, m)