def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
  """ Calculates the information gain for a set of points and activity values

  **Arguments**

    - bitVects: a *sequence* containing *IntVectors*

    - actVals: a *sequence*

    - nPossibleActs: the (integer) number of possible activity values.

    - nPossibleBitVals: (optional) if specified, this integer provides the maximum
      value attainable by the (increasingly inaccurately named) bits in _bitVects_.

  **Returns**

     a list of floats

  **Raises**

     ValueError if _bitVects_ and _actVals_ have different lengths

  """
  if len(bitVects) != len(actVals):
    raise ValueError('var and activity lists should be the same length')
  nBits = len(bitVects[0])
  # numpy.float was a deprecated alias for the builtin float and was removed
  # in NumPy 1.24; use the concrete float64 dtype instead
  res = numpy.zeros(nBits, numpy.float64)
  for bit in range(nBits):
    counts = FormCounts(bitVects, actVals, bit, nPossibleActs, nPossibleBitVals=nPossibleBitVals)
    res[bit] = entropy.InfoGain(counts)
  return res
def AnalyzeSparseVects(bitVects, actVals):
  """ Calculates info-gain values for the bits of a set of sparse bit vectors

  #DOC

  **Arguments**

    - bitVects: a *sequence* containing SBVs

    - actVals: a *sequence*

  **Returns**

     a 2-tuple containing:

       1) a list of (bit, gain, nActive, nInactive) 4-tuples, one for each
          bit that is set in at least one vector

       2) the corresponding list of gain values

  **Raises**

     ValueError if _bitVects_ and _actVals_ have different lengths

  **Notes**

    - these need to be bit vects and binary activities

  """
  nPts = len(bitVects)
  if nPts != len(actVals):
    # Python 3 syntax; the old `raise ValueError, msg` form is a SyntaxError
    raise ValueError('var and activity lists should be the same length')
  nBits = bitVects[0].GetSize()

  # abstract numpy.integer is not a valid dtype in modern NumPy; use int64
  actives = numpy.zeros(nBits, numpy.int64)
  inactives = numpy.zeros(nBits, numpy.int64)
  nActives, nInactives = 0, 0
  # accumulate per-bit on-counts, split by activity class
  for i in range(nPts):
    sig, act = bitVects[i], actVals[i]
    onBitList = sig.GetOnBits()
    if act:
      for bit in onBitList:
        actives[bit] += 1
      nActives += 1
    else:
      for bit in onBitList:
        inactives[bit] += 1
      nInactives += 1

  resTbl = numpy.zeros((2, 2), numpy.int64)
  res = []
  gains = []
  # xrange is Python 2 only; range is the Python 3 equivalent
  for bit in range(nBits):
    nAct, nInact = actives[bit], inactives[bit]
    if nAct or nInact:
      # 2x2 contingency table: rows are on/off, columns are active/inactive
      resTbl[0, 0] = nAct
      resTbl[1, 0] = nPts - nAct
      resTbl[0, 1] = nInact
      resTbl[1, 1] = nPts - nInact
      gain = entropy.InfoGain(resTbl)
      gains.append(gain)
      res.append((bit, gain, nAct, nInact))
  return res, gains
def ID3Boot(examples, attrs, nPossibleVals, initialVar=None, depth=0, maxDepth=-1, **kwargs):
  """ Bootstrapping code for the ID3 algorithm

    see ID3 for descriptions of the arguments

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy approach).
    Otherwise, _initialVar_ will be used as the first split.

  """
  totEntropy = CalcTotalEntropy(examples, nPossibleVals)
  varTable = GenVarTable(examples, nPossibleVals, attrs)

  tree = DecTree.DecTreeNode(None, 'node')
  # tree.SetExamples(examples)
  tree._nResultCodes = nPossibleVals[-1]

  # pick the root split: either the caller-supplied variable or, greedily,
  # the one with the largest information gain
  if initialVar is not None:
    best = initialVar
  else:
    gains = [entropy.InfoGain(tbl) for tbl in varTable]
    best = attrs[numpy.argmax(gains)]

  tree.SetName('Var: %d' % best)
  tree.SetData(totEntropy)
  tree.SetLabel(best)
  tree.SetTerminal(0)

  nextAttrs = list(attrs)
  if not kwargs.get('recycleVars', 0):
    nextAttrs.remove(best)

  # one child subtree per possible value of the chosen variable
  for val in range(nPossibleVals[best]):
    subset = [ex for ex in examples if ex[best] == val]
    tree.AddChildNode(ID3(subset, best, nextAttrs, nPossibleVals, depth, maxDepth, **kwargs))
  return tree
def QuantTreeBoot(examples, attrs, nPossibleVals, nBoundsPerVar, initialVar=None, maxDepth=-1,
                  **kwargs):
  """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
    choose the first variable in the tree (the standard greedy approach).
    Otherwise, _initialVar_ will be used as the first split.

  **Arguments**

    - examples: a list of lists of variable values; the last entry of each
      example is its result code

    - attrs: a list of ints indicating which variables can be used in the tree

    - nPossibleVals: a list containing the number of possible values of
      every variable

    - nBoundsPerVar: the number of quantization bounds for each variable
      (-1: variable cannot be used; 0: already quantized; >0: number of bounds)

    - initialVar: (optional) the variable to use for the first split

    - maxDepth: (optional) the maximum depth to which the tree will be grown

  **Returns**

    a QuantTree.QuantTreeNode with the root of the tree

  """
  attrs = list(attrs)
  # variables flagged with -1 bounds cannot be used at all; drop them up front
  for i in range(len(nBoundsPerVar)):
    if nBoundsPerVar[i] == -1 and i in attrs:
      attrs.remove(i)

  tree = QuantTree.QuantTreeNode(None, 'node')
  nPossibleRes = nPossibleVals[-1]
  tree._nResultCodes = nPossibleRes

  resCodes = [int(x[-1]) for x in examples]
  # histogram of result codes, used for majority-vote fallback leaves
  counts = [0] * nPossibleRes
  for res in resCodes:
    counts[res] += 1
  if initialVar is None:
    best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes,
                                       nPossibleVals, attrs, **kwargs)
  else:
    best = initialVar
    if nBoundsPerVar[best] > 0:
      # materialize the variable's values as a list: Python 3's map() is a
      # lazy iterator which FindVarMultQuantBounds cannot len()/re-iterate;
      # this also matches the comprehension form already used in FindBest
      vTable = [x[best] for x in examples]
      qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBoundsPerVar[best], resCodes,
                                                          nPossibleRes)
    elif nBoundsPerVar[best] == 0:
      # already-quantized variable: compute the gain directly
      vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
      gainHere = entropy.InfoGain(vTable)
      qBounds = []
    else:
      gainHere = -1e6
      qBounds = []

  tree.SetName('Var: %d' % (best))
  tree.SetData(gainHere)
  tree.SetLabel(best)
  tree.SetTerminal(0)
  tree.SetQuantBounds(qBounds)
  nextAttrs = list(attrs)
  if not kwargs.get('recycleVars', 0):
    nextAttrs.remove(best)

  indices = list(range(len(examples)))
  if len(qBounds) > 0:
    # continuous split: peel off the examples below each bound in turn
    for bound in qBounds:
      nextExamples = []
      for index in list(indices):
        ex = examples[index]
        if ex[best] < bound:
          nextExamples.append(ex)
          indices.remove(index)

      if len(nextExamples):
        tree.AddChildNode(
          BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                         maxDepth=maxDepth, **kwargs))
      else:
        # no examples in this bin; add a majority-vote terminal node
        v = numpy.argmax(counts)
        tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
    # add the last points remaining
    nextExamples = []
    for index in indices:
      nextExamples.append(examples[index])
    if len(nextExamples) != 0:
      tree.AddChildNode(
        BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                       maxDepth=maxDepth, **kwargs))
    else:
      v = numpy.argmax(counts)
      tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
  else:
    # categorical split: one child per possible value of the variable
    for val in range(nPossibleVals[best]):
      nextExamples = []
      for example in examples:
        if example[best] == val:
          nextExamples.append(example)
      if len(nextExamples) != 0:
        tree.AddChildNode(
          BuildQuantTree(nextExamples, best, nextAttrs, nPossibleVals, nBoundsPerVar, depth=1,
                         maxDepth=maxDepth, **kwargs))
      else:
        v = numpy.argmax(counts)
        tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)
  return tree
def FindBest(resCodes, examples, nBoundsPerVar, nPossibleRes, nPossibleVals, attrs, exIndices=None,
             **kwargs):
  """ Finds the variable (and its quantization bounds) giving the best information gain

  **Arguments**

    - resCodes: the result codes of the examples

    - examples: a list of lists of variable values

    - nBoundsPerVar: the number of quantization bounds for each variable
      (-1: variable cannot be used; 0: already quantized; >0: number of bounds)

    - nPossibleRes: the number of possible result codes

    - nPossibleVals: a list containing the number of possible values of
      every variable

    - attrs: a list of ints indicating which variables can be considered

    - exIndices: (optional) the indices of the examples to be considered;
      defaults to all of them

  **Returns**

    a 3-tuple containing:

      1) the best variable (-1 if none improved on the starting gain)

      2) the gain it provides

      3) its quantization bounds (an empty list for already-quantized variables)

  """
  bestGain = -1e6
  best = -1
  bestBounds = []

  if exIndices is None:
    exIndices = list(range(len(examples)))

  if not len(exIndices):
    return best, bestGain, bestBounds

  nToTake = kwargs.get('randomDescriptors', 0)
  if nToTake > 0:
    # random-forest style: only consider a random subset of the descriptors
    nAttrs = len(attrs)
    if nToTake < nAttrs:
      ids = list(range(nAttrs))
      # NOTE: the `random=` keyword argument to shuffle() was deprecated in
      # Python 3.9 and removed in 3.11; the default source is equivalent here
      random.shuffle(ids)
      attrs = [attrs[x] for x in ids[:nToTake]]

  for var in attrs:
    nBounds = nBoundsPerVar[var]
    if nBounds > 0:
      try:
        vTable = [examples[x][var] for x in exIndices]
      except IndexError:
        print('index error retrieving variable: %d' % var)
        raise
      qBounds, gainHere = Quantize.FindVarMultQuantBounds(vTable, nBounds, resCodes, nPossibleRes)
    elif nBounds == 0:
      # already-quantized variable: compute the gain directly
      vTable = ID3.GenVarTable((examples[x] for x in exIndices), nPossibleVals, [var])[0]
      gainHere = entropy.InfoGain(vTable)
      qBounds = []
    else:
      gainHere = -1e6
      qBounds = []
    if gainHere > bestGain:
      bestGain = gainHere
      bestBounds = qBounds
      best = var
    elif bestGain == gainHere:
      # break ties in favor of the split with fewer bounds (simpler tree)
      if len(qBounds) < len(bestBounds):
        best = var
        bestBounds = qBounds
  if best == -1:
    print('best unaltered')
    print('\tattrs:', attrs)
    print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
    print('\texamples:')
    for example in (examples[x] for x in exIndices):
      print('\t\t', example)
  return best, bestGain, bestBounds
def _PyRecurseOnBounds(vals, cuts, which, starts, results, nPossibleRes, varTable=None):
  """ Primarily intended for internal use

   Recursively finds the best quantization boundaries

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ )

     - which: an integer indicating which bound is being adjusted here
       (and index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: accepted for interface compatibility only; it is never
       read because the loop below regenerates the table for every cut
       position before using it

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far

       2) a list of the quantization bound indices ( _cuts_ for the best case)

   **Notes**

    - this is not even remotely efficient, which is why a C replacement was
      written

  """
  nBounds = len(cuts)
  maxGain = -1e6
  bestCuts = None
  # the last legal position for this cut, leaving room for the cuts after it
  highestCutHere = len(starts) - nBounds + which

  # NOTE: the original also pre-computed varTable here when it was None, but
  # that value was dead: the loop regenerates it before first use.
  while cuts[which] <= highestCutHere:
    varTable = _GenVarTable(vals, cuts, starts, results, nPossibleRes)
    gainHere = entropy.InfoGain(varTable)
    if gainHere > maxGain:
      maxGain = gainHere
      bestCuts = cuts[:]
    # recurse on the next vars if needed
    if which < nBounds - 1:
      gainHere, cutsHere = _RecurseOnBounds(vals, cuts[:], which + 1, starts, results,
                                            nPossibleRes, varTable=varTable)
      if gainHere > maxGain:
        maxGain = gainHere
        bestCuts = cutsHere
    # update this cut
    cuts[which] += 1
    # keep the later cuts strictly ordered after this one
    for i in range(which + 1, nBounds):
      if cuts[i] == cuts[i - 1]:
        cuts[i] += 1

  return maxGain, bestCuts
def ID3(examples, target, attrs, nPossibleVals, depth=0, maxDepth=-1, **kwargs):
  """ Implements the ID3 algorithm for constructing decision trees.

    From Mitchell's book, page 56

    This is *slightly* modified from Mitchell's book because it supports
      multivalued (non-binary) results.

    **Arguments**

      - examples: a list (nInstances long) of lists of variable values + instance
              values

      - target: an int

      - attrs: a list of ints indicating which variables can be used in the tree

      - nPossibleVals: a list containing the number of possible values of
                   every variable.

      - depth: (optional) the current depth in the tree

      - maxDepth: (optional) the maximum depth to which the tree
                   will be grown

    **Returns**

     a DecTree.DecTreeNode with the decision tree

    **NOTE:** This code cannot bootstrap (start from nothing...)
          use _ID3Boot_ (below) for that.

  """
  varTable = GenVarTable(examples, nPossibleVals, attrs)
  tree = DecTree.DecTreeNode(None, 'node')

  # keep the total entropy around in case it is interesting later
  tree.SetData(CalcTotalEntropy(examples, nPossibleVals))
  # tree.SetExamples(examples)

  # the distribution of result codes for this target
  tMat = GenVarTable(examples, nPossibleVals, [target])[0]
  counts = sum(tMat)
  nzCounts = numpy.nonzero(counts)[0]

  if len(nzCounts) == 1:
    # pure node: every remaining example shares one result code — a terminal
    res = nzCounts[0]
    tree.SetLabel(res)
    tree.SetName(str(res))
    tree.SetTerminal(1)
  elif len(attrs) == 0 or (maxDepth >= 0 and depth >= maxDepth):
    # no variables left (or depth limit reached): fall back to the
    # most prevalent result code
    v = numpy.argmax(counts)
    tree.SetLabel(v)
    tree.SetName('%d?' % v)
    tree.SetTerminal(1)
  else:
    # split on the variable giving the largest information gain
    best = attrs[numpy.argmax([entropy.InfoGain(x) for x in varTable])]
    nextAttrs = attrs[:]
    if not kwargs.get('recycleVars', 0):
      nextAttrs.remove(best)

    tree.SetName('Var: %d' % best)
    tree.SetLabel(best)
    # tree.SetExamples(examples)
    tree.SetTerminal(0)

    # grow one subtree per possible value of the chosen variable
    for val in range(nPossibleVals[best]):
      subset = [ex for ex in examples if ex[best] == val]
      if not subset:
        # no examples carry this value, so recursion would be pointless;
        # add a majority-vote terminal instead (this can and does happen)
        v = numpy.argmax(counts)
        tree.AddChild('%d' % v, label=v, data=0.0, isTerminal=1)
      else:
        tree.AddChildNode(
          ID3(subset, best, nextAttrs, nPossibleVals, depth + 1, maxDepth, **kwargs))
  return tree