# Imports assumed for this excerpt (module paths follow Orange3's layout):
from os import path

import numpy as np

from Orange.classification import TreeLearner
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
from Orange.tree import (
    TreeModel, Node, DiscreteNode, MappedDiscreteNode, NumericNode)
from Orange.widgets.tests.base import WidgetOutputsTestMixin


@classmethod
def setUpClass(cls):
    super().setUpClass()
    WidgetOutputsTestMixin.init(cls)

    tree = TreeLearner()
    cls.model = tree(cls.data)
    cls.model.instances = cls.data
    cls.signal_name = "Tree"
    cls.signal_data = cls.model

    # Load a dataset that contains two variables with the same entropy
    data_same_entropy = Table(path.join(
        path.dirname(path.dirname(path.dirname(__file__))),
        "tests", "datasets", "same_entropy.tab"))
    cls.data_same_entropy = tree(data_same_entropy)
    cls.data_same_entropy.instances = data_same_entropy

    # Build a small three-level tree by hand; `subset` holds the indices
    # of the rows that reach each node
    vara = DiscreteVariable("aaa", values=("e", "f", "g"))
    root = DiscreteNode(vara, 0, np.array([42, 8]))
    root.subset = np.arange(50)

    varb = DiscreteVariable("bbb", values=tuple("ijkl"))
    child0 = MappedDiscreteNode(varb, 1, np.array([0, 1, 0, 0]), (38, 5))
    child0.subset = np.arange(16)
    child1 = Node(None, 0, (13, 3))
    child1.subset = np.arange(16, 30)
    varc = ContinuousVariable("ccc")
    child2 = NumericNode(varc, 2, 42, (78, 12))
    child2.subset = np.arange(30, 50)
    root.children = (child0, child1, child2)

    child00 = Node(None, 0, (15, 4))
    child00.subset = np.arange(10)
    child01 = Node(None, 0, (10, 5))
    child01.subset = np.arange(10, 16)
    child0.children = (child00, child01)

    child20 = Node(None, 0, (90, 4))
    child20.subset = np.arange(30, 35)
    child21 = Node(None, 0, (70, 9))
    child21.subset = np.arange(35, 50)
    child2.children = (child20, child21)

    domain = Domain([vara, varb, varc], ContinuousVariable("y"))
    t = [[i, j, k] for i in range(3) for j in range(4) for k in (40, 44)]
    x = np.array((t * 3)[:50])
    data = Table.from_numpy(domain, x, np.arange(len(x)))
    cls.tree = TreeModel(data, root)
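
# --- Illustrative sketch (not part of the original tests) ---
# The hand-built tree above is only consistent if the children of every
# internal node partition their parent's rows exactly (root: 0..50 splits
# into 0..16, 16..30 and 30..50, and so on). A minimal check of that
# invariant; the helper name `assert_subsets_partition` is hypothetical:

def assert_subsets_partition(node):
    """Recursively check that children's subsets exactly partition the parent's."""
    children = [c for c in (getattr(node, "children", ()) or ()) if c is not None]
    if not children:
        return  # a leaf holds its subset but has nothing to partition
    merged = np.concatenate([c.subset for c in children])
    assert np.array_equal(np.sort(merged), np.sort(node.subset))
    for child in children:
        assert_subsets_partition(child)

# e.g. assert_subsets_partition(root) passes for the fixture built above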
def _score_disc():
    # Closure: `attr`, `attr_no`, `col_x`, `col_y` and `self` come from
    # the enclosing attribute-selection scope
    n_values = len(attr.values)
    score = _tree_scorers.compute_grouped_MSE(
        col_x, col_y, n_values, self.min_samples_leaf)
    # The score is already adjusted for missing attribute values, so
    # we don't do it here
    if score == 0:
        return REJECT_ATTRIBUTE
    branches = col_x.flatten()
    branches[np.isnan(branches)] = -1
    return score, DiscreteNode(attr, attr_no, None), branches, n_values
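
# --- Illustrative sketch (not part of the original module) ---
# `_tree_scorers.compute_grouped_MSE` is a Cython scorer; conceptually it
# rewards a discrete split by how much the within-branch sum of squared
# deviations drops relative to the unsplit node. A rough pure-numpy
# stand-in (`grouped_mse_gain` is hypothetical and glosses over details of
# the real scorer, such as its exact normalization and missing-value
# adjustment):

def grouped_mse_gain(col_x, col_y, n_values, min_samples_leaf):
    mask = ~np.isnan(col_x)
    x, y = col_x[mask].astype(int), col_y[mask]
    if len(y) == 0:
        return 0.0
    total_sse = np.sum((y - y.mean()) ** 2)  # error of the unsplit node
    split_sse = 0.0
    for value in range(n_values):
        branch = y[x == value]
        if 0 < len(branch) < min_samples_leaf:
            return 0.0  # a branch that is too small rejects the split
        if len(branch):
            split_sse += np.sum((branch - branch.mean()) ** 2)
    return (total_sse - split_sse) / len(y)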
def _score_disc():
    """Scoring for discrete attributes, no binarization

    The function computes the entropy itself rather than calling other
    functions, to make sure it uses the same definition as the classes
    below, which compute entropy themselves for efficiency reasons."""
    n_values = len(attr.values)
    if n_values < 2:
        return REJECT_ATTRIBUTE
    cont = _tree_scorers.contingency(
        col_x, len(data.domain.attributes[attr_no].values),
        data.Y, len(data.domain.class_var.values))
    attr_distr = np.sum(cont, axis=0)
    null_nodes = attr_distr < self.min_samples_leaf
    # This is just for speed: if at most one branch is non-null, the
    # entropy cannot decrease anyway
    if sum(null_nodes) >= n_values - 1:
        return REJECT_ATTRIBUTE
    cont[:, null_nodes] = 0
    attr_distr = np.sum(cont, axis=0)
    cls_distr = np.sum(cont, axis=1)
    n = np.sum(attr_distr)
    # Avoid log(0); <= instead of == because we need an array
    cls_distr[cls_distr <= 0] = 1
    attr_distr[attr_distr <= 0] = 1
    cont[cont <= 0] = 1
    class_entr = n * np.log(n) - np.sum(cls_distr * np.log(cls_distr))
    attr_entr = np.sum(attr_distr * np.log(attr_distr))
    cont_entr = np.sum(cont * np.log(cont))
    score = (class_entr - attr_entr + cont_entr) / n / np.log(2)
    score *= n / len(data)  # penalty for missing values
    branches = col_x
    branches[np.isnan(branches)] = -1
    if score == 0:
        return REJECT_ATTRIBUTE
    node = DiscreteNode(attr, attr_no, None)
    return score, node, branches, n_values
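
# --- Illustrative sketch (not part of the original module) ---
# The closed form above exploits the identity (in nats)
#   n * (H(Y) - H(Y|X)) = n log n - sum_c n_c log n_c
#                         - sum_v n_v log n_v + sum_{c,v} n_cv log n_cv,
# then divides by n and log 2 to get the information gain in bits, and
# scales by n / len(data) to penalize attributes with missing values.
# A cross-check of the identity against a direct computation (the helper
# names are hypothetical):

def entropy_bits(counts):
    """Shannon entropy, in bits, of a vector of nonnegative counts."""
    counts = counts[counts > 0]
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def info_gain_direct(cont):
    """H(Y) - H(Y|X) from a class-by-attribute-value contingency table."""
    n = cont.sum()
    gain = entropy_bits(cont.sum(axis=1))  # H(Y) from the class distribution
    for col in cont.T:  # one attribute value per column
        if col.sum():
            gain -= col.sum() / n * entropy_bits(col)  # weighted H(Y|X=v)
    return gain

cont = np.array([[10., 2., 0.], [3., 7., 8.]])  # rows: classes, cols: values
n = cont.sum()
xlogx = lambda a: np.sum(a[a > 0] * np.log(a[a > 0]))
score = (n * np.log(n) - xlogx(cont.sum(axis=1)) - xlogx(cont.sum(axis=0))
         + xlogx(cont)) / n / np.log(2)
assert np.isclose(score, info_gain_direct(cont))  # both give ~0.404 bits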