def setUpClass(cls):
    """Build the shared fixtures: a learned tree model, a dataset with
    tied-entropy attributes, and a small hand-crafted tree for layout tests.
    """
    super().setUpClass()
    WidgetOutputsTestMixin.init(cls)
    # Fit a real tree on the mixin-provided data; it doubles as the
    # widget's input signal.
    tree = TreeLearner()
    cls.model = tree(cls.data)
    cls.model.instances = cls.data
    cls.signal_name = "Tree"
    cls.signal_data = cls.model
    # Load a dataset that contains two variables with the same entropy
    data_same_entropy = Table(
        path.join(path.dirname(path.dirname(path.dirname(__file__))),
                  "tests", "datasets", "same_entropy.tab"))
    cls.data_same_entropy = tree(data_same_entropy)
    cls.data_same_entropy.instances = data_same_entropy
    # Hand-built three-level tree: a discrete root ("aaa") with three
    # children — a mapped-discrete split, a leaf, and a numeric split.
    vara = DiscreteVariable("aaa", values=("e", "f", "g"))
    root = DiscreteNode(vara, 0, np.array([42, 8]))
    root.subset = np.arange(50)  # root covers all 50 rows
    varb = DiscreteVariable("bbb", values=tuple("ijkl"))
    child0 = MappedDiscreteNode(varb, 1, np.array([0, 1, 0, 0]), (38, 5))
    child0.subset = np.arange(16)
    child1 = Node(None, 0, (13, 3))
    child1.subset = np.arange(16, 30)
    varc = ContinuousVariable("ccc")
    child2 = NumericNode(varc, 2, 42, (78, 12))
    child2.subset = np.arange(30, 50)
    root.children = (child0, child1, child2)
    # Leaves under the mapped-discrete split; subsets partition child0's rows.
    child00 = Node(None, 0, (15, 4))
    child00.subset = np.arange(10)
    child01 = Node(None, 0, (10, 5))
    child01.subset = np.arange(10, 16)
    child0.children = (child00, child01)
    # Leaves under the numeric split; subsets partition child2's rows.
    child20 = Node(None, 0, (90, 4))
    child20.subset = np.arange(30, 35)
    child21 = Node(None, 0, (70, 9))
    child21.subset = np.arange(35, 50)
    child2.children = (child20, child21)
    # Synthetic 50-row table matching the tree's three attributes and a
    # continuous target; wrapped with the hand-built root into a TreeModel.
    domain = Domain([vara, varb, varc], ContinuousVariable("y"))
    t = [[i, j, k] for i in range(3) for j in range(4) for k in (40, 44)]
    x = np.array((t * 3)[:50])
    data = Table.from_numpy(domain, x, np.arange(len(x)))
    cls.tree = TreeModel(data, root)
def fit_storage(self, data):
    """Induce a tree from `data` and return it wrapped in a TreeModel."""
    if self.binarize:
        # Exhaustive binarization enumerates value subsets, so refuse
        # over-sized discrete attributes up front.
        for attr in data.domain.attributes:
            if attr.is_discrete \
                    and len(attr.values) > self.MAX_BINARIZATION:
                # No fallback in the script; widgets can prevent this error
                # by providing a fallback and issue a warning about doing so
                raise ValueError("Exhaustive binarization does not handle "
                                 "attributes with more than {} values".format(
                                     self.MAX_BINARIZATION))
    # Rows whose target is defined; int32 indices are threaded through
    # the recursion as node subsets.
    defined = ~np.isnan(data.Y)
    active_inst = np.nonzero(defined)[0].astype(np.int32)
    root = self.build_tree(data, active_inst)
    if root is None:
        # Too few instances for even one leaf: emit a degenerate root.
        root = Node(None, 0, np.array([0., 0.]))
    root.subset = active_inst
    return TreeModel(data, root)
def fit_storage(self, data):
    """Fit a tree on `data`; returns the resulting TreeModel."""
    oversized = self.binarize and any(
        attr.is_discrete and len(attr.values) > self.MAX_BINARIZATION
        for attr in data.domain.attributes)
    if oversized:
        # No fallback in the script; widgets can prevent this error
        # by providing a fallback and issue a warning about doing so
        raise ValueError("Exhaustive binarization does not handle "
                         "attributes with more than {} values".
                         format(self.MAX_BINARIZATION))
    # Restrict induction to rows with a known target value.
    active_inst = np.nonzero(~np.isnan(data.Y))[0].astype(np.int32)
    root = self.build_tree(data, active_inst)
    if root is None:
        # Nothing could be induced — fall back to an empty leaf node.
        root = Node(None, 0, np.array([0., 0.]))
    root.subset = active_inst
    model = TreeModel(data, root)
    return model
def build_tree(self, data, active_inst, level=1):
    """Recursively induce a (sub)tree over the given instance indices.

    Args:
        data: the full training table
        active_inst: row indices belonging to this node
        level: current depth, starting at 1 for the root

    Returns:
        root node (Node), or None when there are too few instances
    """
    subset = data[active_inst]
    n_insts = len(subset)
    if n_insts < self.min_samples_leaf:
        return None
    stop_splitting = (
        n_insts < self.min_samples_split
        or (self.max_depth is not None and level > self.max_depth))
    if stop_splitting:
        node, branches, n_children = Node(None, None, None), None, 0
    else:
        node, branches, n_children = self._select_attr(subset)
    y_mean = np.mean(subset.Y)
    y_var = np.var(subset.Y)
    # Store [mean, variance]; a nan variance is replaced by 1 so the
    # stored value stays finite.
    node.value = np.array([y_mean, 1 if np.isnan(y_var) else y_var])
    node.subset = active_inst
    if branches is not None:
        node.children = [
            self.build_tree(data, active_inst[branches == child], level + 1)
            for child in range(n_children)]
    return node
def _build_tree(self, data, active_inst, level=1):
    """Recursively induce a classification (sub)tree.

    Args:
        data: the full training table
        active_inst: row indices belonging to this node
        level: current depth, starting at 1 for the root

    Returns:
        root node (Node), or None when there are too few instances
    """
    subset = data[active_inst]
    distr = distribution.Discrete(subset, data.domain.class_var)
    n_insts = len(subset)
    if n_insts < self.min_samples_leaf:
        return None
    # Stop splitting on small nodes, sufficiently pure nodes, or when
    # the depth limit has been reached.
    make_leaf = (
        n_insts < self.min_samples_split
        or max(distr) >= sum(distr) * self.sufficient_majority
        or (self.max_depth is not None and level > self.max_depth))
    if make_leaf:
        node, branches, n_children = Node(None, None, distr), None, 0
    else:
        node, branches, n_children = self._select_attr(subset)
    node.subset = active_inst
    if branches is not None:
        node.children = [
            self._build_tree(data, active_inst[branches == child], level + 1)
            for child in range(n_children)]
    return node
def build_tree(self, data, active_inst, level=1):
    """Induce a tree from the given data

    Returns:
        root node (Node)"""
    node_insts = data[active_inst]
    if len(node_insts) < self.min_samples_leaf:
        return None
    # Make a plain leaf when the node is too small to split or the depth
    # limit is exceeded; otherwise let _select_attr pick the split.
    if len(node_insts) < self.min_samples_split or \
            self.max_depth is not None and level > self.max_depth:
        node, branches, n_children = Node(None, None, None), None, 0
    else:
        node, branches, n_children = self._select_attr(node_insts)
    # Node value is [mean, variance] of the target; a nan variance is
    # replaced by 1 so the stored value stays finite.
    mean, var = np.mean(node_insts.Y), np.var(node_insts.Y)
    node.value = np.array([mean, 1 if np.isnan(var) else var])
    node.subset = active_inst
    if branches is not None:
        # Recurse per branch; children whose subsets are too small come
        # back as None.
        node.children = [
            self.build_tree(data, active_inst[branches == br], level + 1)
            for br in range(n_children)]
    return node
def build_tree(self, data, active_inst, level=1):
    """Induce a tree from the given data

    Returns:
        root node (Node)"""
    node_insts = data[active_inst]
    # Class distribution of this node's instances; stored on leaves and
    # used for the majority-stopping test below.
    distr = distribution.Discrete(node_insts, data.domain.class_var)
    if len(node_insts) < self.min_samples_leaf:
        return None
    # Make a leaf when the node is too small, already sufficiently pure,
    # or past the depth limit; otherwise let _select_attr pick the split.
    if len(node_insts) < self.min_samples_split or \
            max(distr) >= sum(distr) * self.sufficient_majority or \
            self.max_depth is not None and level > self.max_depth:
        node, branches, n_children = Node(None, None, distr), None, 0
    else:
        node, branches, n_children = self._select_attr(node_insts)
    node.subset = active_inst
    if branches is not None:
        # Recurse per branch; children whose subsets are too small come
        # back as None.
        node.children = [
            self.build_tree(data, active_inst[branches == br], level + 1)
            for br in range(n_children)]
    return node