Esempio n. 1
0
 def _score_disc_bin():
     """Score a discrete attribute by its best binary split (MSE criterion).

     An attribute with exactly two values is delegated to `_score_disc`.

     Returns `REJECT_ATTRIBUTE` when the binarization score equals zero;
     otherwise the tuple `(score, node, branches, 2)`, where `node` is a
     `MappedDiscreteNode` built from the final mapping.
     """
     n_values = len(attr.values)
     if n_values == 2:
         return _score_disc()

     binarization = _tree_scorers.find_binarization_MSE(
         col_x, col_y, n_values, self.min_samples_leaf)
     split_score, raw_mapping = binarization
     # No correction for missing attribute values is applied here: the
     # score returned by the scorer is already adjusted for them.
     if split_score == 0:
         return REJECT_ATTRIBUTE

     column = data.X[:, attr_no]
     final_mapping, branch_indices = \
         MappedDiscreteNode.branches_from_mapping(
             column, raw_mapping, n_values)
     split_node = MappedDiscreteNode(attr, attr_no, final_mapping, None)
     return split_score, split_node, branch_indices, 2
Esempio n. 2
0
    def setUpClass(cls):
        """Prepare the fixtures stored on the test class.

        Sets `cls.model`, `cls.signal_name`, `cls.signal_data`,
        `cls.data_same_entropy` and `cls.tree`. The last one is a
        hand-assembled `TreeModel` with a three-way root, two inner nodes
        and five leaves over a 50-row synthetic table.
        """
        super().setUpClass()
        WidgetOutputsTestMixin.init(cls)

        learner = TreeLearner()
        induced = learner(cls.data)
        induced.instances = cls.data
        cls.model = induced

        cls.signal_name = "Tree"
        cls.signal_data = induced

        # Dataset containing two variables with the same entropy.
        base_dir = path.dirname(path.dirname(path.dirname(__file__)))
        same_entropy_file = path.join(
            base_dir, "tests", "datasets", "same_entropy.tab")
        same_entropy_table = Table(same_entropy_file)
        same_entropy_model = learner(same_entropy_table)
        same_entropy_model.instances = same_entropy_table
        # Despite its name, this attribute holds the model induced from
        # the same-entropy data, not the data table itself.
        cls.data_same_entropy = same_entropy_model

        def covering(node, indices):
            # Attach the row indices to the node and hand the node back.
            node.subset = indices
            return node

        var_a = DiscreteVariable("aaa", values=("e", "f", "g"))
        var_b = DiscreteVariable("bbb", values=tuple("ijkl"))
        var_c = ContinuousVariable("ccc")

        # Root and its three children.
        root = covering(
            DiscreteNode(var_a, 0, np.array([42, 8])), np.arange(50))
        mapped = covering(
            MappedDiscreteNode(var_b, 1, np.array([0, 1, 0, 0]), (38, 5)),
            np.arange(16))
        leaf = covering(Node(None, 0, (13, 3)), np.arange(16, 30))
        numeric = covering(
            NumericNode(var_c, 2, 42, (78, 12)), np.arange(30, 50))
        root.children = (mapped, leaf, numeric)

        # Leaves below the mapped-discrete child.
        mapped.children = (
            covering(Node(None, 0, (15, 4)), np.arange(10)),
            covering(Node(None, 0, (10, 5)), np.arange(10, 16)),
        )

        # Leaves below the numeric child.
        numeric.children = (
            covering(Node(None, 0, (90, 4)), np.arange(30, 35)),
            covering(Node(None, 0, (70, 9)), np.arange(35, 50)),
        )

        domain = Domain([var_a, var_b, var_c], ContinuousVariable("y"))
        # 24 distinct rows, repeated three times and cut to the first 50.
        rows = [[a, b, c]
                for a in range(3) for b in range(4) for c in (40, 44)]
        features = np.array((rows * 3)[:50])
        targets = np.arange(len(features))
        cls.tree = TreeModel(
            Table.from_numpy(domain, features, targets), root)
Esempio n. 3
0
 def _score_disc_bin():
     """Score a discrete attribute by its best entropy-based binarization.

     Attributes with at most two values are delegated to `_score_disc`.

     Returns `REJECT_ATTRIBUTE` when the attribute distribution sums to
     zero or the best score is not positive; otherwise the tuple
     `(score, node, branches, 2)`, with the score scaled by one minus the
     ratio of the summed contingency unknowns to `len(data)`.
     """
     n_values = len(attr.values)
     if n_values <= 2:
         return _score_disc()

     table = contingency.Discrete(data, attr)
     value_totals = np.sum(table, axis=0)
     # Taken from the contingency table, so instances with a missing
     # value of the attribute are skipped.
     class_totals = np.sum(table, axis=1)
     if np.sum(value_totals) == 0:
         # Every value of the attribute is missing.
         return REJECT_ATTRIBUTE

     found = _tree_scorers.find_binarization_entropy(
         table, class_totals, value_totals, self.min_samples_leaf)
     top_score, top_mapping = found
     if top_score <= 0:
         return REJECT_ATTRIBUTE

     known_fraction = 1 - np.sum(table.unknowns) / len(data)
     top_score *= known_fraction
     final_mapping, branch_indices = \
         MappedDiscreteNode.branches_from_mapping(
             col_x, top_mapping, n_values)
     split_node = MappedDiscreteNode(attr, attr_no, final_mapping, None)
     return top_score, split_node, branch_indices, 2
Esempio n. 4
0
 def _score_disc_bin():
     """Score a discrete attribute by its best binary split (MSE criterion).

     An attribute with exactly two values is delegated to `_score_disc`.

     Returns `REJECT_ATTRIBUTE` when the binarization score equals zero;
     otherwise the tuple `(score, node, branches, 2)`, where `node` is a
     `MappedDiscreteNode` built from the final mapping.
     """
     n_values = len(attr.values)
     if n_values == 2:
         return _score_disc()

     found = _tree_scorers.find_binarization_MSE(
         col_x, col_y, n_values, self.min_samples_leaf)
     split_score, raw_mapping = found
     # The scorer has already adjusted the score for missing attribute
     # values, so no further correction is made here.
     if split_score == 0:
         return REJECT_ATTRIBUTE

     final_mapping, branch_indices = \
         MappedDiscreteNode.branches_from_mapping(
             col_x, raw_mapping, n_values)
     split_node = MappedDiscreteNode(attr, attr_no, final_mapping, None)
     return split_score, split_node, branch_indices, 2
Esempio n. 5
0
 def _score_disc_bin():
     """Score a discrete attribute by its best entropy-based binarization.

     Attributes with at most two values are delegated to `_score_disc`.

     Returns `REJECT_ATTRIBUTE` when the attribute distribution sums to
     zero or the best score is not positive; otherwise the tuple
     `(score, node, branches, 2)`, with the score scaled by one minus the
     ratio of the summed contingency unknowns to `len(data)`.
     """
     n_values = len(attr.values)
     if n_values <= 2:
         return _score_disc()

     table = contingency.Discrete(data, attr)
     value_totals = np.sum(table, axis=0)
     # Taken from the contingency table, so instances with a missing
     # value of the attribute are skipped.
     class_totals = np.sum(table, axis=1)
     if np.sum(value_totals) == 0:
         # Every value of the attribute is missing.
         return REJECT_ATTRIBUTE

     found = _tree_scorers.find_binarization_entropy(
         table, class_totals, value_totals, self.min_samples_leaf)
     top_score, top_mapping = found
     if top_score <= 0:
         return REJECT_ATTRIBUTE

     known_fraction = 1 - np.sum(table.unknowns) / len(data)
     top_score *= known_fraction
     column = data.X[:, attr_no]
     final_mapping, branch_indices = \
         MappedDiscreteNode.branches_from_mapping(
             column, top_mapping, n_values)
     split_node = MappedDiscreteNode(attr, attr_no, final_mapping, None)
     return top_score, split_node, branch_indices, 2