Example #1
0
  def testGithubIssue18(self):
    # Scalar values must be accepted, while values wrapped in 1-tuples
    # are expected to be rejected with a ValueError by both entry points.
    values = [0, 1, 2, 3, 4]
    codes = [0, 0, 1, 1, 1]
    # the result is not inspected; the call only has to complete
    Quantize.FindVarMultQuantBounds(values, 1, codes, 2)

    wrapped = [(v, ) for v in values]
    with self.assertRaises(ValueError):
      Quantize.FindVarMultQuantBounds(wrapped, 1, codes, 2)
    with self.assertRaises(ValueError):
      Quantize._FindStartPoints(wrapped, codes, len(wrapped))
Example #2
0
    def _computeQuantBounds(self):
        """ Computes quantization bounds for every attribute that requests them

        For each attribute index in self._attrs whose entry in self._qBounds
        is positive, every bound count from 1 up to that entry is tried and
        the bounds giving the largest info gain are stored in
        self._QBoundVals.  Attributes with a non-positive entry are left
        untouched.

        """
        nExamples = len(self._trainingExamples)
        nAttrs = len(self._attrs)

        # tabulate the attribute values (one column per attribute) and
        # collect the result value, which is the last element of each example
        allVals = numpy.zeros((nExamples, nAttrs), 'd')
        res = []
        for row, eg in enumerate(self._trainingExamples):
            res.append(eg[-1])
            for col, ai in enumerate(self._attrs):
                allVals[row, col] = eg[ai]

        # column by column, keep the bound set with the maximum info gain
        for col, ai in enumerate(self._attrs):
            maxBounds = self._qBounds[ai]
            if not maxBounds > 0:
                continue

            bestBounds = []
            bestGain = -1.0
            for nTry in range(1, maxBounds + 1):
                bnds, igain = Quantize.FindVarMultQuantBounds(
                    allVals[:, col], nTry, res, self._nClasses)
                if igain > bestGain:
                    bestBounds, bestGain = bnds, igain
            self._QBoundVals[ai] = bestBounds
Example #3
0
 def testMultSplit3(self):
   """  4 possible results
   """
   d = [(1., 0),
        (1.1, 0),
        (1.2, 0),
        (1.4, 2),
        (1.4, 2),
        (1.6, 2),
        (2., 2),
        (2.1, 1),
        (2.1, 1),
        (2.1, 1),
        (2.2, 1),
        (2.3, 1),
        (3.0, 3),
        (3.1, 3),
        (3.2, 3),
        (3.3, 3)]
   # zip(*d) yields real sequences on both Python 2 and 3; map() would hand
   # lazy iterators to the quantization code on Python 3.
   varValues, resCodes = zip(*d)
   nPossibleRes = 4
   res = Quantize.FindVarMultQuantBounds(varValues, 3, resCodes, nPossibleRes)
   target = ([1.30, 2.05, 2.65], 1.97722)
   # unittest assertions are used because bare asserts vanish under -O
   self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                   'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
   boundsMsg = 'split bound comparison failed: %s != %s' % (res[0], target[0])
   # an explicit length check: zip() would silently ignore missing bounds
   self.assertEqual(len(res[0]), len(target[0]), boundsMsg)
   self.assertTrue(all(Quantize.feq(x, y, 1e-4) for x, y in zip(res[0], target[0])),
                   boundsMsg)
Example #4
0
def runIt(namesAndTypes,dbConnect,nBounds,resCol,typesToDo=('float',)):
  """ Prints quantization bounds for every selected column of a database

    **Arguments**

      - namesAndTypes: sequence of (column name, column type) pairs

      - dbConnect: object providing GetColumns(name); the first element of
        each returned row is used as the value

      - nBounds: number of quantization bounds to find for each column

      - resCol: index into namesAndTypes of the column holding the results

      - typesToDo: column types that should be quantized

    **Notes**

      - the number of possible results is taken to be max(results)+1

      - one line with the column name and the result of
        Quantize.FindVarMultQuantBounds is printed per processed column

  """
  # materialize the results: they are scanned by max() and then reused for
  # every column, which a lazy Python 3 map object would not survive
  results = [x[0] for x in dbConnect.GetColumns(namesAndTypes[resCol][0])]
  nPossibleRes = max(results)+1
  for cName,cType in namesAndTypes:
    if cType in typesToDo:
      dList = [x[0] for x in dbConnect.GetColumns(cName)]
      qDat = Quantize.FindVarMultQuantBounds(dList,nBounds,results,nPossibleRes)
      # a single pre-formatted argument prints identically on Python 2 and 3
      print('%s %s' % (cName, qDat))
Example #5
0
 def testMultSplit2(self):
   """ same test as testMultSplit1, but out of order
   """
   d = [(1., 0), (2.1, 1), (1.1, 0), (1.2, 0), (1.4, 2), (1.6, 2), (2., 2), (1.4, 2), (2.1, 1),
        (2.2, 1), (2.1, 1), (2.3, 1)]
   varValues, resCodes = zip(*d)
   nPossibleRes = 3
   res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes)
   target = ([1.3, 2.05], 1.55458)
   # unittest assertions are used because bare asserts vanish under -O
   self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                   'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
   boundsMsg = 'split bound comparison failed: %s != %s' % (res[0], target[0])
   # an explicit length check: zip() would silently ignore missing bounds
   self.assertEqual(len(res[0]), len(target[0]), boundsMsg)
   self.assertTrue(all(Quantize.feq(x, y, 1e-4) for x, y in zip(res[0], target[0])),
                   boundsMsg)
Example #6
0
 def testMultSplit5(self):
   """ dual valued, with an island, a bit noisy
   """
   d = [(1., 0), (1.1, 0), (1.2, 0), (1.4, 1), (1.4, 0), (1.6, 1), (2., 1), (2.1, 0), (2.1, 0),
        (2.1, 0), (2.2, 1), (2.3, 0)]
   varValues, resCodes = zip(*d)
   nPossibleRes = 2
   res = Quantize.FindVarMultQuantBounds(varValues, 2, resCodes, nPossibleRes)
   target = ([1.3, 2.05], .34707)
   # unittest assertions are used because bare asserts vanish under -O
   self.assertTrue(Quantize.feq(res[1], target[1], 1e-4),
                   'InfoGain comparison failed: %s != %s' % (res[1], target[1]))
   boundsMsg = 'split bound comparison failed: %s != %s' % (res[0], target[0])
   # an explicit length check: zip() would silently ignore missing bounds
   self.assertEqual(len(res[0]), len(target[0]), boundsMsg)
   self.assertTrue(all(Quantize.feq(x, y, 1e-4) for x, y in zip(res[0], target[0])),
                   boundsMsg)
Example #7
0
 def testMultSplit1_simple_dual(self):
     # two bounds requested for data carrying three result codes
     samples = [
         (1., 0), (1.1, 0), (1.2, 0),
         (1.4, 2), (1.4, 2), (1.6, 2), (2., 2),
         (2.1, 1), (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1),
     ]
     values, codes = zip(*samples)
     nCodes = 3
     expectedBounds = [1.3, 2.05]
     expectedGain = 1.55458

     outcome = Quantize.FindVarMultQuantBounds(values, 2, codes, nCodes)
     foundBounds = outcome[0]

     # every returned bound has to agree with its expected counterpart
     boundMatches = [
         Quantize.feq(found, wanted, 1e-4)
         for found, wanted in zip(foundBounds, expectedBounds)
     ]
     self.assertEqual(
         min(boundMatches), 1,
         'split bound comparison failed: %s != %s' % (foundBounds,
                                                      expectedBounds))

     foundGain = outcome[1]
     self.assertTrue(
         Quantize.feq(foundGain, expectedGain, 1e-4),
         'InfoGain comparison failed: %s != %s' % (foundGain, expectedGain))
Example #8
0
 def testMultSplit4_dualValued_island(self):
     # two result codes; code 1 forms an island inside the code-0 values
     samples = [
         (1., 0), (1.1, 0), (1.2, 0),
         (1.4, 1), (1.4, 1), (1.6, 1), (2., 1),
         (2.1, 0), (2.1, 0), (2.1, 0), (2.2, 0), (2.3, 0),
     ]
     values, codes = zip(*samples)
     nCodes = 2
     expectedBounds = [1.3, 2.05]
     expectedGain = .91830

     outcome = Quantize.FindVarMultQuantBounds(values, 2, codes, nCodes)
     foundBounds, foundGain = outcome[0], outcome[1]

     self.assertTrue(
         Quantize.feq(foundGain, expectedGain, 1e-4),
         'InfoGain comparison failed: %s != %s' % (foundGain, expectedGain))

     # every returned bound has to agree with its expected counterpart
     boundMatches = [
         Quantize.feq(found, wanted, 1e-4)
         for found, wanted in zip(foundBounds, expectedBounds)
     ]
     self.assertEqual(
         min(boundMatches), 1,
         'split bound comparison failed: %s != %s' % (foundBounds,
                                                      expectedBounds))
Example #9
0
 def testMultSplit2_outOfOrder(self):
     # the data of testMultSplit1, presented unsorted
     samples = [
         (1., 0), (2.1, 1), (1.1, 0), (1.2, 0),
         (1.4, 2), (1.6, 2), (2., 2), (1.4, 2),
         (2.1, 1), (2.2, 1), (2.1, 1), (2.3, 1),
     ]
     values, codes = zip(*samples)
     nCodes = 3
     expectedBounds = [1.3, 2.05]
     expectedGain = 1.55458

     outcome = Quantize.FindVarMultQuantBounds(values, 2, codes, nCodes)
     foundBounds, foundGain = outcome[0], outcome[1]

     self.assertTrue(
         Quantize.feq(foundGain, expectedGain, 1e-4),
         'InfoGain comparison failed: %s != %s' % (foundGain, expectedGain))

     # every returned bound has to agree with its expected counterpart
     worstMatch = min(
         Quantize.feq(found, wanted, 1e-4)
         for found, wanted in zip(foundBounds, expectedBounds))
     self.assertEqual(
         worstMatch, 1,
         'split bound comparison failed: %s != %s' % (foundBounds,
                                                      expectedBounds))
Example #10
0
 def testMultSplit5_dualValued_island_noisy(self):
     # two result codes with an island whose edges are slightly noisy
     samples = [
         (1., 0), (1.1, 0), (1.2, 0),
         (1.4, 1), (1.4, 0), (1.6, 1), (2., 1),
         (2.1, 0), (2.1, 0), (2.1, 0), (2.2, 1), (2.3, 0),
     ]
     values, codes = zip(*samples)
     nCodes = 2
     expectedBounds = [1.3, 2.05]
     expectedGain = .34707

     outcome = Quantize.FindVarMultQuantBounds(values, 2, codes, nCodes)
     foundBounds, foundGain = outcome[0], outcome[1]

     self.assertTrue(
         Quantize.feq(foundGain, expectedGain, 1e-4),
         'InfoGain comparison failed: %s != %s' % (foundGain, expectedGain))

     # every returned bound has to agree with its expected counterpart
     worstMatch = min(
         Quantize.feq(found, wanted, 1e-4)
         for found, wanted in zip(foundBounds, expectedBounds))
     self.assertEqual(
         worstMatch, 1,
         'split bound comparison failed: %s != %s' % (foundBounds,
                                                      expectedBounds))
Example #11
0
 def testMultSplit3_4results(self):
     # three bounds requested for data carrying four result codes
     samples = [
         (1., 0), (1.1, 0), (1.2, 0),
         (1.4, 2), (1.4, 2), (1.6, 2), (2., 2),
         (2.1, 1), (2.1, 1), (2.1, 1), (2.2, 1), (2.3, 1),
         (3.0, 3), (3.1, 3), (3.2, 3), (3.3, 3),
     ]
     values, codes = zip(*samples)
     nCodes = 4
     expectedBounds = [1.30, 2.05, 2.65]
     expectedGain = 1.97722

     outcome = Quantize.FindVarMultQuantBounds(values, 3, codes, nCodes)
     foundBounds, foundGain = outcome[0], outcome[1]

     self.assertTrue(
         Quantize.feq(foundGain, expectedGain, 1e-4),
         'InfoGain comparison failed: %s != %s' % (foundGain, expectedGain))

     # every returned bound has to agree with its expected counterpart
     boundMatches = [
         Quantize.feq(found, wanted, 1e-4)
         for found, wanted in zip(foundBounds, expectedBounds)
     ]
     self.assertEqual(
         min(boundMatches), 1,
         'split bound comparison failed: %s != %s' % (foundBounds,
                                                      expectedBounds))
Example #12
0
def QuantTreeBoot(examples,
                  attrs,
                  nPossibleVals,
                  nBoundsPerVar,
                  initialVar=None,
                  maxDepth=-1,
                  **kwargs):
    """ Bootstrapping code for the QuantTree

    If _initialVar_ is not set, the algorithm will automatically
     choose the first variable in the tree (the standard greedy
     approach).  Otherwise, _initialVar_ will be used as the first
     split.

    **Arguments**

      - examples: sequence of examples; the last element of each example
        is converted to int and used as its result code

      - attrs: indices of the variables that may be used for splitting

      - nPossibleVals: number of possible values of each variable; the
        last entry is used as the number of result codes

      - nBoundsPerVar: number of quantization bounds for each variable.
        Variables with a value of -1 are removed from _attrs_, a value of 0
        marks a variable that is split on its discrete values.

      - initialVar: (optional) the variable to use for the first split

      - maxDepth: passed on to BuildQuantTree for the child nodes

      - kwargs: passed on to FindBest and BuildQuantTree.  If
        'recycleVars' is set, the variable used here remains available
        to the children.

    **Returns**

      the root node of the tree

    **Notes**

      - unless 'recycleVars' is set, the chosen variable must be present
        in _attrs_ (after the -1 variables were removed), otherwise
        list.remove() raises a ValueError

  """
    attrs = list(attrs)
    for i, nBounds in enumerate(nBoundsPerVar):
        if nBounds == -1 and i in attrs:
            attrs.remove(i)

    tree = QuantTree.QuantTreeNode(None, 'node')
    nPossibleRes = nPossibleVals[-1]
    tree._nResultCodes = nPossibleRes

    resCodes = [int(x[-1]) for x in examples]
    counts = [0] * nPossibleRes
    for res in resCodes:
        counts[res] += 1
    if initialVar is None:
        best, gainHere, qBounds = FindBest(resCodes, examples, nBoundsPerVar,
                                           nPossibleRes, nPossibleVals, attrs,
                                           **kwargs)
    else:
        best = initialVar
        if nBoundsPerVar[best] > 0:
            # a real list (as in FindBest): on Python 3 map() would produce
            # a lazy iterator instead of a sequence of values
            vTable = [x[best] for x in examples]
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBoundsPerVar[best], resCodes, nPossibleRes)
        elif nBoundsPerVar[best] == 0:
            vTable = ID3.GenVarTable(examples, nPossibleVals, [best])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []

    tree.SetName('Var: %d' % (best))
    tree.SetData(gainHere)
    tree.SetLabel(best)
    tree.SetTerminal(0)
    tree.SetQuantBounds(qBounds)
    nextAttrs = list(attrs)
    if not kwargs.get('recycleVars', 0):
        nextAttrs.remove(best)

    def _addBranch(branchExamples):
        # Grows a subtree for a populated branch; an empty branch gets a
        # terminal node labelled with the most common result code overall.
        if len(branchExamples) != 0:
            tree.AddChildNode(
                BuildQuantTree(branchExamples,
                               best,
                               nextAttrs,
                               nPossibleVals,
                               nBoundsPerVar,
                               depth=1,
                               maxDepth=maxDepth,
                               **kwargs))
        else:
            v = numpy.argmax(counts)
            tree.AddChild('%d??' % (v), label=v, data=0.0, isTerminal=1)

    if len(qBounds) > 0:
        # peel off, bound by bound, the examples lying below each bound;
        # a single pass per bound keeps the original example order
        remaining = list(examples)
        for bound in qBounds:
            below = []
            notBelow = []
            for ex in remaining:
                if ex[best] < bound:
                    below.append(ex)
                else:
                    notBelow.append(ex)
            remaining = notBelow
            _addBranch(below)
        # add the last points remaining
        _addBranch(remaining)
    else:
        for val in range(nPossibleVals[best]):
            _addBranch([example for example in examples if example[best] == val])
    return tree
Example #13
0
def FindBest(resCodes,
             examples,
             nBoundsPerVar,
             nPossibleRes,
             nPossibleVals,
             attrs,
             exIndices=None,
             **kwargs):
    """ Finds the variable with the largest info gain

    **Arguments**

      - resCodes: result codes handed to Quantize.FindVarMultQuantBounds
        together with the variable values of the selected examples
        (presumably one code per entry of _exIndices_, in the same order;
        verify against the caller)

      - examples: sequence of examples, indexed by the entries of
        _exIndices_

      - nBoundsPerVar: number of quantization bounds for each variable.
        Positive: the variable is quantized with that many bounds,
        0: the gain is computed from the ID3 variable table,
        negative: the variable can never be selected.

      - nPossibleRes: number of possible result codes

      - nPossibleVals: number of possible values for each variable

      - attrs: indices of the candidate variables

      - exIndices: (optional) indices of the examples to be used,
        defaults to all examples

      - kwargs: if 'randomDescriptors' is positive and smaller than the
        number of candidates, only that many randomly picked candidates
        are considered

    **Returns**

      a 3-tuple: (best variable, its gain, its quantization bounds).
      If there are no examples or no variable could be selected this is
      (-1, -1e6, []).

    **Notes**

      - if two variables have the same gain, the one with fewer
        quantization bounds is preferred

    """
    bestGain = -1e6
    best = -1
    bestBounds = []

    if exIndices is None:
        exIndices = list(range(len(examples)))

    if not len(exIndices):
        return best, bestGain, bestBounds

    nToTake = kwargs.get('randomDescriptors', 0)
    if 0 < nToTake < len(attrs):
        ids = list(range(len(attrs)))
        # the "random" argument of shuffle() was removed in Python 3.11;
        # the default already draws from the module-level generator
        random.shuffle(ids)
        attrs = [attrs[x] for x in ids[:nToTake]]

    for var in attrs:
        nBounds = nBoundsPerVar[var]
        if nBounds > 0:
            try:
                vTable = [examples[x][var] for x in exIndices]
            except IndexError:
                print('index error retrieving variable: %d' % var)
                raise
            qBounds, gainHere = Quantize.FindVarMultQuantBounds(
                vTable, nBounds, resCodes, nPossibleRes)
        elif nBounds == 0:
            vTable = ID3.GenVarTable((examples[x] for x in exIndices),
                                     nPossibleVals, [var])[0]
            gainHere = entropy.InfoGain(vTable)
            qBounds = []
        else:
            gainHere = -1e6
            qBounds = []
        if gainHere > bestGain:
            bestGain = gainHere
            bestBounds = qBounds
            best = var
        elif bestGain == gainHere:
            # tie: prefer the variable that needs fewer bounds
            if len(qBounds) < len(bestBounds):
                best = var
                bestBounds = qBounds
    if best == -1:
        # nothing was selectable, dump the inputs to help diagnose it
        print('best unaltered')
        print('\tattrs:', attrs)
        print('\tnBounds:', numpy.take(nBoundsPerVar, attrs))
        print('\texamples:')
        for example in (examples[x] for x in exIndices):
            print('\t\t', example)

    return best, bestGain, bestBounds