Ejemplo n.º 1
0
    def setUpClass(cls):
        """Create the fixtures shared by the tests of this class.

        Sets up a model learned from ``cls.data`` (also used as the signal
        value), a model learned from the ``same_entropy.tab`` dataset, and a
        small hand-assembled ``TreeModel`` stored in ``cls.tree``.
        """
        super().setUpClass()
        WidgetOutputsTestMixin.init(cls)

        learner = TreeLearner()

        # Model induced from the class data; it doubles as the signal value
        fitted = learner(cls.data)
        fitted.instances = cls.data
        cls.model = fitted
        cls.signal_name = "Tree"
        cls.signal_data = fitted

        # Load a dataset that contains two variables with the same entropy
        base_dir = path.dirname(path.dirname(path.dirname(__file__)))
        same_entropy_table = Table(
            path.join(base_dir, "tests", "datasets", "same_entropy.tab"))
        same_entropy_model = learner(same_entropy_table)
        same_entropy_model.instances = same_entropy_table
        cls.data_same_entropy = same_entropy_model

        # Hand-built tree: the root has three children; the first and the
        # third get two children each, the middle one gets none
        attr_a = DiscreteVariable("aaa", values=("e", "f", "g"))
        attr_b = DiscreteVariable("bbb", values=tuple("ijkl"))
        attr_c = ContinuousVariable("ccc")

        top = DiscreteNode(attr_a, 0, np.array([42, 8]))
        top.subset = np.arange(50)

        left = MappedDiscreteNode(attr_b, 1, np.array([0, 1, 0, 0]), (38, 5))
        left.subset = np.arange(16)
        middle = Node(None, 0, (13, 3))
        middle.subset = np.arange(16, 30)
        right = NumericNode(attr_c, 2, 42, (78, 12))
        right.subset = np.arange(30, 50)
        top.children = (left, middle, right)

        left_a = Node(None, 0, (15, 4))
        left_a.subset = np.arange(10)
        left_b = Node(None, 0, (10, 5))
        left_b.subset = np.arange(10, 16)
        left.children = (left_a, left_b)

        right_a = Node(None, 0, (90, 4))
        right_a.subset = np.arange(30, 35)
        right_b = Node(None, 0, (70, 9))
        right_b.subset = np.arange(35, 50)
        right.children = (right_a, right_b)

        # 50 rows over the three attributes; the row index is the target "y"
        domain = Domain([attr_a, attr_b, attr_c], ContinuousVariable("y"))
        combos = [[a, b, c]
                  for a in range(3) for b in range(4) for c in (40, 44)]
        features = np.array((combos * 3)[:50])
        table = Table.from_numpy(domain, features, np.arange(len(features)))
        cls.tree = TreeModel(table, top)
Ejemplo n.º 2
0
    def fit_storage(self, data):
        """Induce a tree from `data` and return it wrapped in a `TreeModel`.

        Only the rows whose `data.Y` value is not NaN take part in the
        induction.

        Args:
            data: the table to learn from

        Returns:
            TreeModel: model built from `data` and the induced root node

        Raises:
            ValueError: if `binarize` is set and some discrete attribute has
                more than `MAX_BINARIZATION` values
        """
        if self.binarize:
            for attr in data.domain.attributes:
                if attr.is_discrete \
                        and len(attr.values) > self.MAX_BINARIZATION:
                    # No fallback in the script; widgets can prevent this
                    # error by providing a fallback and issue a warning
                    # about doing so
                    raise ValueError(
                        "Exhaustive binarization does not handle "
                        "attributes with more than {} values".format(
                            self.MAX_BINARIZATION))

        has_target = ~np.isnan(data.Y)
        active_inst = np.nonzero(has_target)[0].astype(np.int32)

        tree_root = self.build_tree(data, active_inst)
        if tree_root is None:
            # build_tree produced no node; fall back to a bare placeholder
            tree_root = Node(None, 0, np.array([0., 0.]))
        tree_root.subset = active_inst
        return TreeModel(data, tree_root)
Ejemplo n.º 3
0
    def fit_storage(self, data):
        """Learn a tree on `data` and wrap the result in a `TreeModel`.

        Rows with a NaN in `data.Y` are left out of the induction.

        Args:
            data: the table to learn from

        Returns:
            TreeModel: model constructed from `data` and the root node

        Raises:
            ValueError: when `binarize` is enabled and a discrete attribute
                has more than `MAX_BINARIZATION` values
        """
        if self.binarize and not all(
                not attr.is_discrete
                or len(attr.values) <= self.MAX_BINARIZATION
                for attr in data.domain.attributes):
            # No fallback in the script; widgets can prevent this error
            # by providing a fallback and issue a warning about doing so
            message = ("Exhaustive binarization does not handle "
                       "attributes with more than {} values")
            raise ValueError(message.format(self.MAX_BINARIZATION))

        known_rows = np.nonzero(~np.isnan(data.Y))[0]
        active_inst = known_rows.astype(np.int32)

        root = self.build_tree(data, active_inst)
        # When no tree could be built, use a bare placeholder node instead
        root = Node(None, 0, np.array([0., 0.])) if root is None else root
        root.subset = active_inst
        return TreeModel(data, root)
Ejemplo n.º 4
0
    def build_tree(self, data, active_inst, level=1):
        """Recursively grow a (sub)tree over the selected instances.

        Args:
            data: the table being learned from; indexed with `active_inst`
            active_inst: index array selecting the instances at this node
            level: depth of the node being built (defaults to 1)

        Returns:
            Node: root of the induced subtree, or None when fewer than
            `min_samples_leaf` instances reach this node
        """
        subset_data = data[active_inst]
        n_insts = len(subset_data)
        if n_insts < self.min_samples_leaf:
            return None

        # Do not split nodes that are too small or beyond the depth limit
        stop_splitting = (
            n_insts < self.min_samples_split
            or (self.max_depth is not None and level > self.max_depth))
        if stop_splitting:
            node, branches, n_children = Node(None, None, None), None, 0
        else:
            node, branches, n_children = self._select_attr(subset_data)

        # Node value is [mean, variance] of the targets; NaN variance -> 1
        targets = subset_data.Y
        mean = np.mean(targets)
        variance = np.var(targets)
        node.value = np.array([mean, 1 if np.isnan(variance) else variance])
        node.subset = active_inst

        if branches is None:
            return node

        # One recursive call per branch; an entry is None where the branch
        # has fewer than `min_samples_leaf` instances
        children = []
        for branch in range(n_children):
            branch_insts = active_inst[branches == branch]
            children.append(self.build_tree(data, branch_insts, level + 1))
        node.children = children
        return node
Ejemplo n.º 5
0
    def _build_tree(self, data, active_inst, level=1):
        """Induce a tree from the given data

        Returns:
            root node (Node)"""
        node_insts = data[active_inst]
        distr = distribution.Discrete(node_insts, data.domain.class_var)
        if len(node_insts) < self.min_samples_leaf:
            return None
        if len(node_insts) < self.min_samples_split or \
                max(distr) >= sum(distr) * self.sufficient_majority or \
                self.max_depth is not None and level > self.max_depth:
            node, branches, n_children = Node(None, None, distr), None, 0
        else:
            node, branches, n_children = self._select_attr(node_insts)
        node.subset = active_inst
        if branches is not None:
            node.children = [
                self._build_tree(data, active_inst[branches == br], level + 1)
                for br in range(n_children)]
        return node
Ejemplo n.º 6
0
    def build_tree(self, data, active_inst, level=1):
        """Build the node for the given instances and, recursively, its subtree.

        Args:
            data: the table being learned from; indexed with `active_inst`
            active_inst: index array selecting the instances at this node
            level: depth of the node being built (defaults to 1)

        Returns:
            Node: the node built for these instances, or None if there are
            fewer than `min_samples_leaf` of them
        """
        insts = data[active_inst]
        if len(insts) < self.min_samples_leaf:
            return None

        # A node is split only if it is large enough and within max depth
        can_split = (
            len(insts) >= self.min_samples_split
            and (self.max_depth is None or level <= self.max_depth))
        if can_split:
            node, branches, n_children = self._select_attr(insts)
        else:
            node, branches, n_children = Node(None, None, None), None, 0

        # Store [mean, variance] of the targets, replacing NaN variance by 1
        y = insts.Y
        mean, var = np.mean(y), np.var(y)
        if np.isnan(var):
            var = 1
        node.value = np.array([mean, var])
        node.subset = active_inst

        if branches is not None:
            # Recurse once per branch; an undersized branch yields None
            node.children = [
                self.build_tree(
                    data, active_inst[branches == child], level + 1)
                for child in range(n_children)]
        return node
Ejemplo n.º 7
0
    def build_tree(self, data, active_inst, level=1):
        """Induce a tree from the given data

        Args:
            data: the table being learned from; indexed with `active_inst`
            active_inst: index array selecting the instances at this node
            level: depth of the node being built (defaults to 1)

        Returns:
            root node (Node), or None when fewer than `min_samples_leaf`
            instances reach this node
        """
        node_insts = data[active_inst]
        n_insts = len(node_insts)
        if n_insts < self.min_samples_leaf:
            return None
        # The class distribution is computed only after the size check, so
        # subsets that are rejected above do not pay for it.
        distr = distribution.Discrete(node_insts, data.domain.class_var)
        # Stop splitting when the node is too small, when the largest entry
        # of the distribution reaches the `sufficient_majority` share of its
        # sum, or when the depth limit is exceeded.
        if n_insts < self.min_samples_split or \
                max(distr) >= sum(distr) * self.sufficient_majority or \
                (self.max_depth is not None and level > self.max_depth):
            node, branches, n_children = Node(None, None, distr), None, 0
        else:
            node, branches, n_children = self._select_attr(node_insts)
        node.subset = active_inst
        if branches is not None:
            # One recursive call per branch; an entry is None where the
            # branch has fewer than `min_samples_leaf` instances
            node.children = [
                self.build_tree(data, active_inst[branches == br], level + 1)
                for br in range(n_children)]
        return node