def _score_disc_bin():
    """Scoring for discrete attributes, with binarization (MSE)"""
    n_values = len(attr.values)
    if n_values == 2:
        return _score_disc()
    score, mapping = _tree_scorers.find_binarization_MSE(
        col_x, col_y, n_values, self.min_samples_leaf)
    # The score is already adjusted for missing attribute values, so
    # we don't do it here
    if score == 0:
        return REJECT_ATTRIBUTE
    mapping, branches = MappedDiscreteNode.branches_from_mapping(
        data.X[:, attr_no], mapping, len(attr.values))
    node = MappedDiscreteNode(attr, attr_no, mapping, None)
    return score, node, branches, 2
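# Illustrative sketch, not part of the module: how an integer bit mask like the
# one returned by the binarization scorers is expected to translate into a
# two-way split. The helper name `toy_branches_from_mask` and the sample column
# are made up for the example; the real conversion is performed by
# MappedDiscreteNode.branches_from_mapping above.
import numpy as np

def toy_branches_from_mask(col_x, bit_mask, n_values):
    # bit i of the mask decides whether value i goes to branch 0 or branch 1
    value_to_branch = np.array([(bit_mask >> i) & 1 for i in range(n_values)])
    # rows with a missing attribute value (NaN) are assigned branch -1
    branches = np.full(len(col_x), -1, dtype=int)
    known = ~np.isnan(col_x)
    branches[known] = value_to_branch[col_x[known].astype(int)]
    return value_to_branch, branches

# Four values, mask 0b0101: values 0 and 2 end up in one branch, values 1 and 3
# in the other; the NaN row is assigned to neither.
# value_to_branch == [1, 0, 1, 0], branches == [1, 0, 1, -1, 0]
toy_branches_from_mask(np.array([0., 1., 2., np.nan, 3.]), 0b0101, 4)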
@classmethod
def setUpClass(cls):
    super().setUpClass()
    WidgetOutputsTestMixin.init(cls)

    tree = TreeLearner()
    cls.model = tree(cls.data)
    cls.model.instances = cls.data
    cls.signal_name = "Tree"
    cls.signal_data = cls.model

    # Load a dataset that contains two variables with the same entropy
    data_same_entropy = Table(
        path.join(path.dirname(path.dirname(path.dirname(__file__))),
                  "tests", "datasets", "same_entropy.tab"))
    cls.data_same_entropy = tree(data_same_entropy)
    cls.data_same_entropy.instances = data_same_entropy

    # Build a small tree by hand: the root splits on a discrete variable
    # into three branches; the first and the last are split further
    vara = DiscreteVariable("aaa", values=("e", "f", "g"))
    root = DiscreteNode(vara, 0, np.array([42, 8]))
    root.subset = np.arange(50)

    varb = DiscreteVariable("bbb", values=tuple("ijkl"))
    child0 = MappedDiscreteNode(varb, 1, np.array([0, 1, 0, 0]), (38, 5))
    child0.subset = np.arange(16)
    child1 = Node(None, 0, (13, 3))
    child1.subset = np.arange(16, 30)
    varc = ContinuousVariable("ccc")
    child2 = NumericNode(varc, 2, 42, (78, 12))
    child2.subset = np.arange(30, 50)
    root.children = (child0, child1, child2)

    child00 = Node(None, 0, (15, 4))
    child00.subset = np.arange(10)
    child01 = Node(None, 0, (10, 5))
    child01.subset = np.arange(10, 16)
    child0.children = (child00, child01)

    child20 = Node(None, 0, (90, 4))
    child20.subset = np.arange(30, 35)
    child21 = Node(None, 0, (70, 9))
    child21.subset = np.arange(35, 50)
    child2.children = (child20, child21)

    domain = Domain([vara, varb, varc], ContinuousVariable("y"))
    t = [[i, j, k] for i in range(3) for j in range(4) for k in (40, 44)]
    x = np.array((t * 3)[:50])
    data = Table.from_numpy(domain, x, np.arange(len(x)))
    cls.tree = TreeModel(data, root)
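# A possible structural check over the hand-built fixture above; a sketch, not
# an existing test. It assumes TreeModel exposes the root node it was
# constructed with as `root`, mirroring the TreeModel(data, root) call in
# setUpClass.
def test_fixture_structure(self):
    root = self.tree.root
    # the root splits into three branches whose subsets cover all 50 rows
    self.assertEqual(len(root.children), 3)
    self.assertEqual(sum(len(c.subset) for c in root.children), 50)
    # the two leaves under the first child partition its 16 rows
    child0 = root.children[0]
    self.assertEqual(sum(len(c.subset) for c in child0.children),
                     len(child0.subset))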
def _score_disc_bin():
    """Scoring for discrete attributes, with binarization"""
    n_values = len(attr.values)
    if n_values <= 2:
        return _score_disc()
    cont = contingency.Discrete(data, attr)
    attr_distr = np.sum(cont, axis=0)
    # Skip instances with missing value of the attribute
    cls_distr = np.sum(cont, axis=1)
    if np.sum(attr_distr) == 0:  # all values are missing
        return REJECT_ATTRIBUTE
    best_score, best_mapping = _tree_scorers.find_binarization_entropy(
        cont, cls_distr, attr_distr, self.min_samples_leaf)
    if best_score <= 0:
        return REJECT_ATTRIBUTE
    best_score *= 1 - np.sum(cont.unknowns) / len(data)
    mapping, branches = MappedDiscreteNode.branches_from_mapping(
        col_x, best_mapping, n_values)
    node = MappedDiscreteNode(attr, attr_no, mapping, None)
    return best_score, node, branches, 2
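# A small standalone sketch, not part of the module, of the missing-value
# penalty applied above: the best binarization score is scaled by the share of
# rows whose value of the attribute is known. The numbers are invented for
# illustration only.
import numpy as np

n_rows = 50        # len(data)
unknowns = 5       # np.sum(cont.unknowns): rows where the attribute is missing
best_score = 0.8   # as returned by find_binarization_entropy
penalized = best_score * (1 - unknowns / n_rows)
assert np.isclose(penalized, 0.72)   # 0.8 * 45/50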