Ejemplo n.º 1
0
 def task(self):
     """Build a tree from data.csv, prune it, and return it serialized.

     Everything except the last ten loaded items is used to build the
     tree; the last ten items are passed to the pruning step.
     """
     instances = dtree.load_csv_dataset(datadir("data.csv"))
     training, validation = instances[:-10], instances[-10:]
     tree = dtree.build_tree(training)
     dtree.prune_tree(tree, validation)
     return serialize_tree(tree)
Ejemplo n.º 2
0
    def test_prune_tree(self):
        """
        Test bottom-up pruning with a validation set.

        The test builds a random tree and picks a random root-to-leaf path.
        Then, for each internal node on that path (deepest first), it:
        - takes a copy of the base tree (``dtBase.copy()``), in which every
          default label and every leaf label is F (``not fTargetValue``)
        - sets the default label of the chosen node to T
        - generates between 1 and 10 T instances whose attribute values
          follow the path from the root to the chosen node
        - prunes the copy against those instances
        - asserts that every ancestor on the path is still an internal node
          and that the chosen node has become a leaf.
        """
        def set_labels(dtRoot, f):
            """Set every default label, and every leaf label, under dtRoot to f."""
            def down(dt):
                if dt.is_leaf():
                    dt.fLabel = f
                dt.fDefaultLabel = f
                # Explicit loop rather than map(): map() is lazy on Python 3,
                # so the recursion would silently never run there.
                for dtChild in dt.dictChildren.values():
                    down(dtChild)
            down(dtRoot)

        def check_passes(dtRoot, dtCheck, inst):
            """Assert that inst, routed from dtRoot, reaches dtCheck.

            Every node visited on the way (dtCheck included) must be an
            internal node with exactly cValue children. If the instance is
            routed away from dtCheck the walk ends on a leaf and the first
            assertion fails.
            """
            dt = dtRoot
            while True:
                assert not dt.is_leaf()
                assert len(dt.dictChildren) == cValue
                if dt == dtCheck:
                    return
                dt = dt.dictChildren[inst.listAttrs[dt.ixAttr]]

        cAttr = random.randint(2, 4)
        cValue = random.randint(2, 4)
        dtBase = build_random_tree(cAttr, cValue)
        listPath = []   # child keys chosen at each level of the random path
        listAttrs = []  # attribute index tested at each level of that path
        fTargetValue = True  # hard-coded; the randomized choice (randbool()) is disabled
        set_labels(dtBase, not fTargetValue)

        # Walk a random root-to-leaf path, recording the choices made.
        dt = dtBase
        while not dt.is_leaf():
            # list(...) so that this also works with Python 3 dict views.
            ixValue = random.choice(list(dt.dictChildren.keys()))
            listPath.append(ixValue)
            listAttrs.append(dt.ixAttr)
            dt = dt.dictChildren[ixValue]

        while listPath:
            # Dropping the last step moves the target one level up the path.
            listPath.pop()
            dt = dtRoot = dtBase.copy()
            for ixValue in listPath:
                dt = dt.dictChildren[ixValue]
                assert dt.is_node()
            dt.fDefaultLabel = fTargetValue

            # Build validation instances that share the path prefix and have
            # random values for the remaining positions.
            listInst = []
            cTail = cAttr - len(listPath)
            for _ in range(random.randint(1, 10)):
                listValue = listPath + randlist(0, cValue - 1, cTail)
                assert len(listValue) == cAttr
                listInstAttr = [None] * cAttr
                for value, ixAttr in zip(listValue, listAttrs):
                    listInstAttr[ixAttr] = value
                inst = dtree.Instance(listInstAttr, fTargetValue)
                check_passes(dtRoot, dt, inst)
                listInst.append(inst)

            dtree.prune_tree(dtRoot, listInst)

            # Ancestors on the path must survive; the target must be a leaf.
            dt = dtRoot
            for ix, ixValue in enumerate(listPath):
                assert dt.ixAttr == listAttrs[ix]
                self.assertTrue(dt.is_node(), str(dtRoot))
                self.assertTrue(ixValue in dt.dictChildren)
                dt = dt.dictChildren[ixValue]
            self.assertTrue(dt.is_leaf(), str(dt))
    def test_prune_tree(self):
        """
        Test bottom-up pruning with a validation set.

        The test builds a random tree whose default labels and leaf labels
        are all F (``not fTargetValue``) and picks a random root-to-leaf
        path. Then, for each internal node on that path (deepest first), it:
        - sets the default label of the chosen node to T
        - generates between 1 and 10 T instances whose attribute values
          follow the path from the root to the chosen node
        - prunes the tree against those instances
        - asserts that every ancestor on the path is still an internal node
          and that the chosen node has become a leaf.

        The same tree object is pruned on every iteration, so the effects of
        earlier (deeper) prunes are still present on later iterations.
        """

        def set_labels(dtRoot, f):
            """Set every default label, and every leaf label, under dtRoot to f."""
            def down(dt):
                if dt.is_leaf():
                    dt.fLabel = f
                dt.fDefaultLabel = f
                # Explicit loop rather than map(): map() is lazy on Python 3,
                # so the recursion would silently never run there.
                for dtChild in dt.dictChildren.values():
                    down(dtChild)
            down(dtRoot)

        def check_passes(dtRoot, dtCheck, inst):
            """Assert that inst, routed from dtRoot, reaches dtCheck.

            Every node visited on the way (dtCheck included) must be an
            internal node with exactly cValue children. If the instance is
            routed away from dtCheck the walk ends on a leaf and the first
            assertion fails.
            """
            dt = dtRoot
            while True:
                assert not dt.is_leaf()
                assert len(dt.dictChildren) == cValue
                if dt == dtCheck:
                    return
                dt = dt.dictChildren[inst.listAttrs[dt.ixAttr]]

        cAttr = random.randint(2, 4)
        cValue = random.randint(2, 4)
        dtBase = build_random_tree(cAttr, cValue)
        listPath = []   # child keys chosen at each level of the random path
        listAttrs = []  # attribute index tested at each level of that path

        fTargetValue = True  # hard-coded; the randomized choice (randbool()) is disabled
        set_labels(dtBase, not fTargetValue)

        # Walk a random root-to-leaf path, recording the choices made.
        dt = dtBase
        while not dt.is_leaf():
            # list(...) so that this also works with Python 3 dict views.
            ixValue = random.choice(list(dt.dictChildren.keys()))
            listPath.append(ixValue)
            listAttrs.append(dt.ixAttr)
            dt = dt.dictChildren[ixValue]

        while listPath:
            # Dropping the last step moves the target one level up the path.
            listPath.pop()
            # NOTE(review): dtBase is pruned in place (no copy is taken), so
            # the node pruned on the previous iteration is still a leaf here
            # and may no longer carry the F label set up by set_labels().
            # Confirm that accumulating prunes across iterations is intended.
            dt = dtRoot = dtBase
            for ixValue in listPath:
                dt = dt.dictChildren[ixValue]
                assert dt.is_node()
            dt.fDefaultLabel = fTargetValue

            # Build validation instances that share the path prefix and have
            # random values for the remaining positions.
            listInst = []
            cTail = cAttr - len(listPath)
            for _ in range(random.randint(1, 10)):
                listValue = listPath + randlist(0, cValue - 1, cTail)
                assert len(listValue) == cAttr
                listInstAttr = [None] * cAttr
                for value, ixAttr in zip(listValue, listAttrs):
                    listInstAttr[ixAttr] = value
                inst = dtree.Instance(listInstAttr, fTargetValue)
                check_passes(dtRoot, dt, inst)
                listInst.append(inst)

            dtree.prune_tree(dtRoot, listInst)

            # Ancestors on the path must survive; the target must be a leaf.
            dt = dtRoot
            for ix, ixValue in enumerate(listPath):
                assert dt.ixAttr == listAttrs[ix]
                self.assertTrue(dt.is_node(), str(dtRoot))
                self.assertTrue(ixValue in dt.dictChildren)
                dt = dt.dictChildren[ixValue]
            self.assertTrue(dt.is_leaf(), str(dt))
Ejemplo n.º 4
0
 def task(self):
     """Build a tree from data.csv, prune it, and return it serialized.

     Everything except the last ten loaded items is used to build the
     tree; the last ten items are passed to the pruning step.
     """
     instances = dtree.load_csv_dataset(datadir("data.csv"))
     training, validation = instances[:-10], instances[-10:]
     tree = dtree.build_tree(training)
     dtree.prune_tree(tree, validation)
     return serialize_tree(tree)