def testMultSplit3(self):
    """ 4 possible results

    Requests three bounds for data carrying four distinct result codes and
    compares both the bounds and the information gain with reference values.
    """
    d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 2), (1.4, 2), (1.6, 2), (2., 2), (2.1, 1),
         (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1), (3.0, 3), (3.1, 3), (3.2, 3), (3.3, 3)]
    # Build real lists: on Python 3 ``map`` returns a one-shot iterator, which
    # is not a sequence (no len(), no indexing, cannot be traversed twice).
    varValues = [x[0] for x in d]
    resCodes = [x[1] for x in d]
    nPossibleRes = 4
    res = Quantize.FindVarMultQuantBounds(varValues, 3, resCodes, nPossibleRes)
    target = ([1.30, 2.05, 2.65], 1.97722)
    assert Quantize.feq(res[1], target[1], 1e-4), \
        'InfoGain comparison failed: %s != %s' % (res[1], target[1])
    # min(...) == 1 means every bound matched its reference value
    assert min(map(lambda x, y: Quantize.feq(x, y, 1e-4), res[0], target[0])) == 1, \
        'split bound comparison failed: %s != %s' % (res[0], target[0])
def testGithubIssue18(self):
    # Flat numeric values: the call is made for its side effects only, the
    # returned value is not inspected.
    values = [0, 1, 2, 3, 4]
    activities = [0, 0, 1, 1, 1]
    Quantize.FindVarMultQuantBounds(values, 1, activities, 2)

    # The same values wrapped in 1-tuples are expected to raise ValueError
    # from both entry points.
    wrapped = [(v, ) for v in values]
    with self.assertRaises(ValueError):
        Quantize.FindVarMultQuantBounds(wrapped, 1, activities, 2)
    with self.assertRaises(ValueError):
        Quantize._FindStartPoints(wrapped, activities, len(wrapped))
def testOneSplit3(self):
    # optimal division not possible
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 2), (1.4, 2), (1.6, 2), (2., 2), (2.1, 1),
               (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 3)
    target = (1.3, 0.88129)
    # one flag per (computed, expected) pair: bound first, then gain
    matches = []
    for got, want in zip(res, target):
        matches.append(Quantize.feq(got, want, 1e-4))
    self.assertEqual(matches, [1, 1], 'result comparison failed: %s != %s' % (res, target))
def testOneSplit2_noise(self):
    # some noise
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 0), (1.4, 1), (1.6, 0), (2., 1), (2.1, 1),
               (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 2)
    target = (1.8, 0.60999)
    # element-wise tolerance comparison of the result against the reference
    agreement = [Quantize.feq(got, want, 1e-4) for got, want in zip(res, target)]
    self.assertEqual(agreement, [1, 1], 'result comparison failed: %s != %s' % (res, target))
def testOneSplit1(self):
    # simple case (clear division)
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 0), (1.4, 0), (1.6, 0), (2., 1), (2.1, 1),
               (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    nResultCodes = 2
    target = (1.8, 0.97095)
    res = Quantize.FindVarQuantBound(varValues, resCodes, nResultCodes)
    closeEnough = []
    for computed, expected in zip(res, target):
        closeEnough.append(Quantize.feq(computed, expected, 1e-4))
    self.assertEqual(closeEnough, [1, 1],
                     'result comparison failed: %s != %s' % (res, target))
def testOneSplit3(self):
    """ optimal division not possible """
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 2), (1.4, 2), (1.6, 2), (2., 2), (2.1, 1),
               (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 3)
    target = (1.3, 0.88129)
    # tolerance comparison of each returned value with its reference
    flags = [Quantize.feq(got, want, 1e-4) for got, want in zip(res, target)]
    assert flags == [1, 1], 'result comparison failed: %s != %s' % (res, target)
def testOneSplit4(self):
    """ lots of duplicates (several samples share the same variable value) """
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.2, 1), (1.4, 0), (1.4, 0), (1.6, 0), (2., 1),
               (2.1, 1), (2.1, 1), (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    target = (1.8, 0.68939)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 2)
    flags = []
    for got, want in zip(res, target):
        flags.append(Quantize.feq(got, want, 1e-4))
    assert flags == [1, 1], 'result comparison failed: %s != %s' % (res, target)
def testOneSplit5(self):
    """ same data as testOneSplit1, supplied out of order """
    shuffled = [(1., 0), (1.1, 0), (2.2, 1), (1.2, 0), (1.6, 0), (1.4, 0), (2., 1), (2.1, 1),
                (1.4, 0), (2.3, 1)]
    varValues, resCodes = zip(*shuffled)
    nResultCodes = 2
    res = Quantize.FindVarQuantBound(varValues, resCodes, nResultCodes)
    target = (1.8, 0.97095)
    comparison = [Quantize.feq(got, want, 1e-4) for got, want in zip(res, target)]
    assert comparison == [1, 1], 'result comparison failed: %s != %s' % (res, target)
def testOneSplit5_outOfOrder(self):
    # same as testOneSplit1 data, but out of order
    shuffled = [(1., 0), (1.1, 0), (2.2, 1), (1.2, 0), (1.6, 0), (1.4, 0), (2., 1), (2.1, 1),
                (1.4, 0), (2.3, 1)]
    varValues, resCodes = zip(*shuffled)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 2)
    target = (1.8, 0.97095)
    verdicts = []
    for computed, expected in zip(res, target):
        verdicts.append(Quantize.feq(computed, expected, 1e-4))
    self.assertEqual(verdicts, [1, 1], 'result comparison failed: %s != %s' % (res, target))
def testOneSplit4_duplicates(self):
    # lots of duplicates
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.2, 1), (1.4, 0), (1.4, 0), (1.6, 0), (2., 1),
               (2.1, 1), (2.1, 1), (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    target = (1.8, 0.68939)
    res = Quantize.FindVarQuantBound(varValues, resCodes, 2)
    # pairwise tolerance check: (bound, gain) against the reference tuple
    verdicts = list(map(lambda pair: Quantize.feq(pair[0], pair[1], 1e-4), zip(res, target)))
    self.assertEqual(verdicts, [1, 1], 'result comparison failed: %s != %s' % (res, target))
def testMultSplit2(self):
    """ same data as testMultSplit1, supplied out of order """
    shuffled = [(1., 0), (2.1, 1), (1.1, 0), (1.2, 0), (1.4, 2), (1.6, 2), (2., 2), (1.4, 2),
                (2.1, 1), (2.2, 1), (2.1, 1), (2.3, 1)]
    varValues, resCodes = zip(*shuffled)
    bounds, gain = [1.3, 2.05], 1.55458
    target = (bounds, gain)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, 3)
    # the information gain is checked first, then every bound
    assert Quantize.feq(res[1], target[1], 1e-4), \
        'InfoGain comparison failed: %s != %s' % (res[1], target[1])
    assert min(Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0])) == 1, \
        'split bound comparison failed: %s != %s' % (res[0], target[0])
def testMultSplit5(self):
    """ dual valued, with an island, a bit noisy """
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0),
               (2.1, 0), (2.1, 0), (2.2, 1), (2.3, 0)]
    varValues, resCodes = zip(*samples)
    nResultCodes = 2
    target = ([1.3, 2.05], .34707)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, nResultCodes)
    assert Quantize.feq(res[1], target[1], 1e-4), \
        'InfoGain comparison failed: %s != %s' % (res[1], target[1])
    # min(...) == 1 holds only when every bound matches its reference
    boundFlags = (Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0]))
    assert min(boundFlags) == 1, \
        'split bound comparison failed: %s != %s' % (res[0], target[0])
def testMultSplit3_4results(self):
    # 4 possible results
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 2), (1.4, 2), (1.6, 2), (2., 2), (2.1, 1),
               (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1), (3.0, 3), (3.1, 3), (3.2, 3), (3.3, 3)]
    varValues, resCodes = zip(*samples)
    target = ([1.30, 2.05, 2.65], 1.97722)
    res = Quantize.FindVarMultQuantBounds(varValues, 3, resCodes, 4)

    # information gain first ...
    gainMatches = Quantize.feq(res[1], target[1], 1e-4)
    self.assertTrue(gainMatches, 'InfoGain comparison failed: %s != %s' % (res[1], target[1]))

    # ... then the bounds; the minimum flag is 1 only if all of them match
    worstBound = min(Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0]))
    self.assertEqual(worstBound, 1,
                     'split bound comparison failed: %s != %s' % (res[0], target[0]))
def testMultSplit1_simple_dual(self):
    # simple dual split
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 2), (1.4, 2), (1.6, 2), (2., 2), (2.1, 1),
               (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1)]
    varValues, resCodes = zip(*samples)
    target = ([1.3, 2.05], 1.55458)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, 3)

    # bounds are verified before the information gain in this test
    boundFlags = [Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0])]
    self.assertEqual(min(boundFlags), 1,
                     'split bound comparison failed: %s != %s' % (res[0], target[0]))
    self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                    'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
def testMultSplit2_outOfOrder(self):
    # same test as testMultSplit1, but out of order
    shuffled = [(1., 0), (2.1, 1), (1.1, 0), (1.2, 0), (1.4, 2), (1.6, 2), (2., 2), (1.4, 2),
                (2.1, 1), (2.2, 1), (2.1, 1), (2.3, 1)]
    varValues, resCodes = zip(*shuffled)
    nResultCodes = 3
    target = ([1.3, 2.05], 1.55458)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, nResultCodes)
    self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                    'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
    boundFlags = []
    for got, want in zip(res[0], target[0]):
        boundFlags.append(Quantize.feq(got, want, 1e-4))
    # the minimum flag is 1 only when every bound agrees with its reference
    self.assertEqual(min(boundFlags), 1,
                     'split bound comparison failed: %s != %s' % (res[0], target[0]))
def testMultSplit5_dualValued_island_noisy(self):
    # dual valued, with an island, a bit noisy
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0),
               (2.1, 0), (2.1, 0), (2.2, 1), (2.3, 0)]
    varValues, resCodes = zip(*samples)
    target = ([1.3, 2.05], .34707)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, 2)

    gainOk = Quantize.feq(res[1], target[1], 1e-4)
    self.assertTrue(gainOk, 'InfoGain comparison failed: %s != %s' % (res[1], target[1]))

    worstBound = min(Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0]))
    self.assertEqual(worstBound, 1,
                     'split bound comparison failed: %s != %s' % (res[0], target[0]))
def testMultSplit4_dualValued_island(self):
    # dual valued, with an island
    samples = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 1), (1.6, 1), (2., 1), (2.1, 0),
               (2.1, 0), (2.1, 0), (2.2, 0), (2.3, 0)]
    varValues, resCodes = zip(*samples)
    nResultCodes = 2
    target = ([1.3, 2.05], .91830)
    res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, nResultCodes)
    self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                    'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
    # every bound must match: the smallest comparison flag has to be 1
    boundFlags = [Quantize.feq(got, want, 1e-4) for got, want in zip(res[0], target[0])]
    self.assertEqual(min(boundFlags), 1,
                     'split bound comparison failed: %s != %s' % (res[0], target[0]))
def _computeQuantBounds(self): neg = len(self._trainingExamples) natr = len(self._attrs) # make a list of results and values allVals = numpy.zeros((neg, natr), 'd') res = [] # list of y values i = 0 for eg in self._trainingExamples: res.append(eg[-1]) j = 0 for ai in self._attrs: val = eg[ai] allVals[i, j] = val j += 1 i += 1 # now loop over each of the columns and compute the bounds # the number of bounds is determined by the maximum info gain i = 0 for ai in self._attrs: nbnds = self._qBounds[ai] if nbnds > 0: mbnds = [] mgain = -1.0 for j in range(1, nbnds + 1): bnds, igain = Quantize.FindVarMultQuantBounds( allVals[:, i], j, res, self._nClasses) if (igain > mgain): mbnds = bnds mgain = igain self._QBoundVals[ai] = mbnds i += 1
def runIt(namesAndTypes, dbConnect, nBounds, resCol, typesToDo=['float']):
    """ Print the quantization bounds for every column of a selected type.

    Arguments:
      - namesAndTypes: sequence of (column name, column type) pairs
      - dbConnect: object providing ``GetColumns(name)``; the first item of
        each returned row is used as the value
      - nBounds: number of bounds requested for each column
      - resCol: index into ``namesAndTypes`` of the column holding the results
      - typesToDo: column types to process (only read, never modified)

    The number of possible results is taken to be ``max(results) + 1``.
    Nothing is returned; one line is printed per processed column.
    """
    # Materialise the results: they are consumed by max() and then reused for
    # every column, which a one-shot Python 3 ``map`` iterator cannot support.
    results = [row[0] for row in dbConnect.GetColumns(namesAndTypes[resCol][0])]
    nPossibleRes = max(results) + 1
    for cName, cType in namesAndTypes:
        if cType in typesToDo:
            dList = [row[0] for row in dbConnect.GetColumns(cName)]
            qDat = Quantize.FindVarMultQuantBounds(dList, nBounds, results, nPossibleRes)
            # same output as the Python 2 statement ``print cName, qDat``
            print('%s %s' % (cName, qDat))
def testOneSplit2(self):
    """ some noise

    One result code is out of place near the split, so the expected gain is
    lower than in the clean case.
    """
    d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 0), (1.4, 1), (1.6, 0), (2., 1), (2.1, 1),
         (2.2, 1), (2.3, 1)]
    # Real lists instead of ``map`` objects: a Python 3 map is a one-shot
    # iterator, not a sequence.
    varValues = [x[0] for x in d]
    resCodes = [x[1] for x in d]
    nPossibleRes = 2
    res = Quantize.FindVarQuantBound(varValues, resCodes, nPossibleRes)
    target = (1.8, 0.60999)
    # list(...) is required: on Python 3 a bare map object never compares
    # equal to a list, which made this assertion fail unconditionally.
    assert list(map(lambda x, y: Quantize.feq(x, y, 1e-4), res, target)) == [1, 1], \
        'result comparison failed: %s != %s' % (res, target)
def test9NewSplits(self):
    """ Compare the two start-point finders against fixed expectations.

    Each data set is passed first to ``Quantize._NewPyFindStartPoints`` and
    then to ``Quantize._FindStartPoints``; both results are compared with the
    same expected list of start points.
    """
    cases = [
        ([(0, 0), (1, 1), (2, 0)], [1, 2]),
        ([(0, 1), (1, 0), (2, 1)], [1, 2]),
        ([(0, 0), (0, 0), (1, 1), (1, 1), (2, 0), (2, 1)], [2, 4]),
        ([(0, 0), (0, 1), (1, 1), (1, 1), (2, 0), (2, 1)], [2, 4]),
        ([(0, 0), (0, 0), (1, 0), (1, 1), (2, 0), (2, 1)], [2, 4]),
        ([(0, 0), (0, 0), (1, 0), (1, 0), (2, 1), (2, 1)], [4]),
        ([(0, 0), (0, 0), (1, 1), (1, 1), (2, 1), (2, 1)], [2]),
        ([(0, 0), (0, 0), (1, 0), (1, 0), (2, 0), (2, 0)], []),
        ([(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 0)], [2, 4]),
        ([(1, 0), (2, 1), (2, 1), (3, 1), (3, 1), (3, 1), (4, 0), (4, 1), (4, 1)], [1, 6]),
        # rows of this set carry a leading index column that is ignored
        ([(1, 1.65175902843, 0), (2, 1.89935600758, 0), (3, 1.89935600758, 1),
          (4, 1.89935600758, 1), (5, 2.7561609745, 1), (6, 2.7561609745, 1),
          (7, 2.7561609745, 1), (8, 2.7561609745, 1), (9, 3.53454303741, 1),
          (10, 3.53454303741, 1), (11, 3.53454303741, 1), (12, 3.53454303741, 1),
          (13, 3.53454303741, 1)], [1, 4]),
    ]
    for d, expected in cases:
        # the last two columns of every row are (variable value, result code)
        varValues, resCodes = list(zip(*d))[-2:]
        for finderName in ('_NewPyFindStartPoints', '_FindStartPoints'):
            res = getattr(Quantize, finderName)(varValues, resCodes, len(d))
            self.assertTrue(res == expected, str(res))
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy
    approach). Otherwise, _initialVar_ will be used as the first
    split.

    Arguments:
      - examples: sequence of examples; the last item of each one is
        converted with ``int()`` and used as its result code
      - attrs: indices of the variables that may be used; variables whose
        entry in _nBoundsPerVar_ is -1 are dropped
      - nPossibleVals: number of possible values per variable; the last
        entry is used as the number of possible result codes
      - nBoundsPerVar: number of quantization bounds per variable
        (0: the variable is used without quantization)
      - initialVar: optional index of the variable used for the first split
      - maxDepth: forwarded to ``BuildQuantTree``
      - kwargs: forwarded to ``FindBest`` and ``BuildQuantTree``; if
        ``recycleVars`` is not set (or false) the split variable is
        removed from the variables passed down to the children

    Returns the root ``QuantTree.QuantTreeNode`` of the new tree.
    """
    attrs = list(attrs)
    for i, nBounds in enumerate(nBoundsPerVar):
        if nBounds == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1

    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                           nPossibleVals, attrs, **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # a real list, not ``map``: on Python 3 a map object is a one-shot
            # iterator and cannot be used where a sequence is expected
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best],
                                                                resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(nextExamples):
        # Grow a subtree for a non-empty subset of examples; an empty subset
        # gets a terminal node labelled with the most common result code.
        if len(nextExamples) != 0:
            tree.AddChildNode(
                BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar,
                               depth=1, maxDepth=maxDepth, **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # Quantized split: each bound claims the not-yet-assigned examples
        # whose value lies below it; whatever is left forms the last branch.
        indices = list(range(len(examples)))
        for bound in qBounds:
            nextExamples = []
            remaining = []
            for index in indices:
                ex = examples[index]
                if ex[best] < bound:
                    nextExamples.append(ex)
                else:
                    remaining.append(index)
            indices = remaining
            _addBranch(nextExamples)
        # add the last points remaining
        _addBranch([examples[index] for index in indices])
    else:
        # Unquantized split: one branch per possible value of the variable
        for val in range(nPossibleVals[best]):
            _addBranch([example for example in examples if example[best] == val])
    return tree
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs,
             exIndices=None, **kwargs):
    """ Find the variable (and its quantization bounds) with the highest gain.

    Arguments:
      - resCodes: result codes, passed on to the quantization code
      - examples: sequence of examples, indexed by variable
      - nBoundsPerVar: number of quantization bounds per variable
        (>0: quantize, 0: use the variable table directly, <0: skip)
      - nPossibleRes: number of possible result codes
      - nPossibleVals: number of possible values per variable
      - attrs: indices of the candidate variables
      - exIndices: optional indices into _examples_ restricting the
        examples considered; defaults to all of them
      - kwargs: ``randomDescriptors`` (default 0): if positive and smaller
        than ``len(attrs)``, only that many randomly chosen variables are
        considered

    Returns a 3-tuple ``(best, bestGain, bestBounds)``. If there are no
    examples to consider, or no variable improves on the initial gain, the
    result is ``(-1, -1e6, [])``. On equal gain the candidate with fewer
    bounds is preferred.
    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if nToTake > 0:
        nAttrs = len(attrs)
        if nToTake < nAttrs:
            ids = list(range(nAttrs))
            # In-place shuffle driven by random.random(). random.shuffle() lost
            # its ``random`` argument in Python 3.11; this loop is what that
            # argument used to select, so seeded runs keep their permutation.
            for i in reversed(range(1, len(ids))):
                j = int(random.random() * (i + 1))
                ids[i], ids[j] = ids[j], ids[i]
            attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes,
                                                                nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the candidate that needs fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds

    if best == -1:
        # diagnostic output: no candidate beat the initial gain
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    if 0:
        # disabled debugging output, kept for reference
        print('BEST:', len(exIndices), best, bestGain, bestBounds)
        if (len(exIndices) < 10):
            print(len(exIndices), len(resCodes), len(examples))
            exs = [examples[x] for x in exIndices]
            vals = [x[best] for x in exs]
            sortIdx = numpy.argsort(vals)
            sortVals = [exs[x] for x in sortIdx]
            sortResults = [resCodes[x] for x in sortIdx]
            for i in range(len(vals)):
                print(' ', i, ['%.4f' % x for x in sortVals[i][1:-1]], sortResults[i])
    return best, bestGain, bestBounds