def test1(self):
    """Pruning with known results.

    Pruning with the original training data should be a no-op; pruning
    against a holdout set containing conflicting points must change the
    tree and report the expected error fraction.
    """
    oPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    # holdout set: the training points plus two conflicting duplicates
    tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]
    tree = ID3.ID3Boot(oPts, attrs=range(3), nPossibleVals=[2] * 4)
    err, badEx = CrossValidate.CrossValidate(tree, oPts)
    assert err == 0.0, 'bad initial error'
    assert len(badEx) == 0, 'bad initial error'

    # prune with original data, shouldn't do anything
    f = StringIO()
    with redirect_stdout(f):
        PruneTree._verbose = True
        try:
            newTree, err = PruneTree.PruneTree(tree, [], oPts)
        finally:
            # BUGFIX: always restore the module-level flag, even when
            # PruneTree raises; otherwise verbosity leaks into later tests
            PruneTree._verbose = False
    self.assertIn('Pruner', f.getvalue())
    assert newTree == tree, 'improper pruning'

    # prune with train data
    newTree, err = PruneTree.PruneTree(tree, [], tPts)
    assert newTree != tree, 'bad pruning'
    assert feq(err, 0.14286), 'bad error result'
def _testChain():
    """Exercise tree pruning on a small chain-like data set (prints results)."""
    from rdkit.ML.DecTree import ID3

    # each row: four binary variables followed by the binary result
    oPts = [
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0],
        [0, 1, 0, 0, 1],
    ]
    # the holdout set is just the training set here
    tPts = oPts

    nCols = len(oPts[0])
    tree = ID3.ID3Boot(oPts, attrs=range(nCols - 1), nPossibleVals=[2] * nCols)
    tree.Print()

    err, badEx = CrossValidate.CrossValidate(tree, oPts)
    print('original error:', err)
    err, badEx = CrossValidate.CrossValidate(tree, tPts)
    print('original holdout error:', err)

    newTree, frac2 = PruneTree(tree, oPts, tPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, tPts)
    print('pruned holdout error is:', err)
    print(badEx)
def _testSpecific():
    """Exercise pruning against a holdout set with conflicting points (prints results)."""
    from rdkit.ML.DecTree import ID3

    oPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    # holdout: the training points plus two conflicting duplicates
    tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]

    tree = ID3.ID3Boot(oPts, attrs=range(3), nPossibleVals=[2] * 4)
    tree.Print()

    err, badEx = CrossValidate.CrossValidate(tree, oPts)
    print('original error:', err)
    err, badEx = CrossValidate.CrossValidate(tree, tPts)
    print('original holdout error:', err)

    newTree, frac2 = PruneTree(tree, oPts, tPts)
    newTree.Print()
    err, badEx = CrossValidate.CrossValidate(newTree, tPts)
    print('pruned holdout error is:', err)
    print(badEx)
    print(len(tree), len(newTree))
def _setupMultiTree(self):
    """Build an ID3 tree over a small data set with a 3-valued result column."""
    examples = [
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 2],
        [0, 1, 1, 2],
        [1, 0, 0, 2],
        [1, 0, 1, 2],
        [1, 1, 0, 2],
        [1, 1, 1, 0],
    ]
    dataSet = MLData.MLQuantDataSet(examples)
    attrs = range(0, dataSet.GetNVars())
    self.t1 = ID3.ID3Boot(dataSet.GetAllData(), attrs, dataSet.GetNPossibleVals())
    self.examples = examples
def TestTree():
    """Testing code for named trees: first column is the point name, last is the result."""
    examples1 = [
        ['p1', 0, 1, 0, 0],
        ['p2', 0, 0, 0, 1],
        ['p3', 0, 0, 1, 2],
        ['p4', 0, 1, 1, 2],
        ['p5', 1, 0, 0, 2],
        ['p6', 1, 0, 1, 2],
        ['p7', 1, 1, 0, 2],
        ['p8', 1, 1, 1, 0],
    ]
    # usable attributes skip the name column (0) and the result column (last)
    attrs = list(range(1, len(examples1[0]) - 1))
    # entry 0 is a placeholder for the (non-numeric) name column
    nPossibleVals = [0, 2, 2, 2, 3]
    t1 = ID3.ID3Boot(examples1, attrs, nPossibleVals, maxDepth=1)
    t1.Print()
def _setupPyMultiTree(self):
    """Build the same multi-valued tree as _setupMultiTree, but with the
    pure-python entropy implementations patched into ID3."""
    from rdkit.ML.InfoTheory import entropy

    # swap in the pure-python entropy/gain implementations
    ID3.entropy.InfoEntropy = entropy.PyInfoEntropy
    ID3.entropy.InfoGain = entropy.PyInfoGain

    examples = [
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 2],
        [0, 1, 1, 2],
        [1, 0, 0, 2],
        [1, 0, 1, 2],
        [1, 1, 0, 2],
        [1, 1, 1, 0],
    ]
    dataSet = MLData.MLQuantDataSet(examples)
    attrs = range(0, dataSet.GetNVars())
    self.t1 = ID3.ID3Boot(dataSet.GetAllData(), attrs, dataSet.GetNPossibleVals())
    self.examples = examples
def test1(self):
    """Pruning with known results: no-op on training data, real pruning
    against a holdout set with conflicting points."""
    oPts = [
        [0, 0, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 1],
    ]
    # holdout: training points plus two conflicting duplicates
    tPts = oPts + [[0, 1, 1, 0], [0, 1, 1, 0]]

    tree = ID3.ID3Boot(oPts, attrs=range(3), nPossibleVals=[2] * 4)
    err, badExamples = CrossValidate.CrossValidate(tree, oPts)
    assert err == 0.0, 'bad initial error'
    assert len(badExamples) == 0, 'bad initial error'

    # prune with original data, shouldn't do anything
    newTree, err = PruneTree.PruneTree(tree, [], oPts)
    assert newTree == tree, 'improper pruning'

    # prune with train data
    newTree, err = PruneTree.PruneTree(tree, [], tPts)
    assert newTree != tree, 'bad pruning'
    assert feq(err, 0.14286), 'bad error result'
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

      If _initialVar_ is not set, the algorithm will automatically
       choose the first variable in the tree (the standard greedy
       approach).  Otherwise, _initialVar_ will be used as the first
       split.

      Returns the root QuantTreeNode of the constructed tree.
    """
    attrs = list(attrs)
    # variables flagged with -1 bounds are excluded from splitting
    for i in range(len(nBoundsPerVar)):
        if nBoundsPerVar[i] == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    # result code of each example is in its last column
    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        # standard greedy choice of the first split
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                           nPossibleVals, attrs, **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # BUGFIX: use a list comprehension rather than a lazy map() object;
            # the quantizer may traverse the table more than once (and this
            # matches how FindBest builds the same table)
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best],
                                                                resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)

    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    indices = list(range(len(examples)))
    if len(qBounds) > 0:
        # quantized variable: partition the examples by the chosen bounds
        for bound in qBounds:
            nextExamples = []
            for index in list(indices):
                ex = examples[index]
                if ex[best] < bound:
                    nextExamples.append(ex)
                    indices.remove(index)

            if len(nextExamples):
                tree.AddChildNode(
                    BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar,
                                   depth=1, maxDepth=maxDepth, **kwargs))
            else:
                # empty bin: terminal node guessing the majority result
                v = numpy.argmax(counts)
                tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
        # add the last points remaining
        nextExamples = []
        for index in indices:
            nextExamples.append(examples[index])
        if len(nextExamples) != 0:
            tree.AddChildNode(
                BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar,
                               depth=1, maxDepth=maxDepth, **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
    else:
        # categorical variable: one branch per possible value
        for val in range(nPossibleVals[best]):
            nextExamples = []
            for example in examples:
                if example[best] == val:
                    nextExamples.append(example)
            if len(nextExamples) != 0:
                tree.AddChildNode(
                    BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar,
                                   depth=1, maxDepth=maxDepth, **kwargs))
            else:
                v = numpy.argmax(counts)
                tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
    return tree
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs, exIndices=None,
             **kwargs):
    """Find the attribute with the best information gain over the examples.

    Arguments:
      resCodes: result code for each example
      examples: the example rows themselves
      nBoundsPerVar: per-variable bound count (>0 quantized, 0 categorical,
        -1 unusable)
      nPossibleRes: number of possible result codes
      nPossibleVals: per-variable value counts (for categorical variables)
      attrs: candidate attribute indices
      exIndices: optional subset of example indices to consider (default: all)
      kwargs: 'randomDescriptors' > 0 restricts the search to that many
        randomly chosen attributes (random-forest style)

    Returns a 3-tuple (best attribute, its gain, its quant bounds); best is -1
    and gain is -1e6 when no attribute produced any gain.
    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if nToTake > 0:
        # random-forest style: only consider a random subset of the attributes
        nAttrs = len(attrs)
        if nToTake < nAttrs:
            ids = list(range(nAttrs))
            # BUGFIX: the `random=` argument to shuffle() was deprecated in
            # Python 3.9 and removed in 3.11; plain shuffle() still honors
            # random.seed()
            random.shuffle(ids)
            attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes,
                                                                nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            # variable flagged unusable
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the variable needing fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds

    if best == -1:
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds
def GenRandomExamples(nVars=10, randScale=0.3, bitProb=0.5, nExamples=500, seed=(0, 0),
                      addResults=1):
    """Generate a random binary data set for tree-building tests.

    Each example is `nVars` random booleans; when `addResults` is true, a
    boolean result (the weighted sum of the bits >= 1) is appended as the
    last column.  Only seed[0] is used to seed the RNG.

    Returns a 3-tuple: (examples, attribute indices, nPossibleVals).
    """
    random.seed(seed[0])
    varWeights = numpy.array([random.random() for _ in range(nVars)]) * randScale
    examples = [None] * nExamples
    for i in range(nExamples):
        varVals = [random.random() > bitProb for _ in range(nVars)]
        temp = numpy.array(varVals) * varWeights
        res = sum(temp)
        if addResults:
            varVals.append(res >= 1.)
        examples[i] = varVals

    # BUGFIX: this was [2] * (nExamples + 1); nPossibleVals tracks the number
    # of columns in each example (nVars variables plus the result), not the
    # number of examples
    nPossibleVals = [2] * (nVars + 1)
    attrs = list(range(nVars))

    return (examples, attrs, nPossibleVals)


if __name__ == '__main__':  # pragma: nocover
    import pickle
    examples, attrs, nPossibleVals = GenRandomExamples()
    # dump the raw data set, then build and pickle a tree from it
    with open('random.dat.pkl', 'wb+') as outF:
        pickle.dump(examples, outF)
        pickle.dump(attrs, outF)
        pickle.dump(nPossibleVals, outF)

    tree = ID3.ID3Boot(examples, attrs, nPossibleVals)
    tree.Pickle('save.pkl')