Ejemplo n.º 1
0
    def test_discrete_missing(self):
        """Contingencies on "zoo" must account for injected NaN values.

        Runs two scenarios, each on a freshly loaded table, and checks the
        resulting count matrix together with the unknown tallies exposed by
        the contingency object.
        """
        nan = float("nan")

        # Scenario 1: NaN is written to Y[25] and to element 0 of row 0;
        # the contingency is built for column index 0.
        zoo = data.Table("zoo")
        with zoo.unlocked():
            zoo.Y[25] = nan
            zoo[0][0] = nan
        first = contingency.Discrete(zoo, 0)
        assert_dist_equal(first["amphibian"], [3, 0])
        expected_first = [
            [3, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 38], [5, 0]]
        assert_dist_equal(first, expected_first)
        np.testing.assert_almost_equal(
            first.col_unknowns, [0, 0, 0, 0, 0, 1, 0])
        np.testing.assert_almost_equal(first.row_unknowns, [1, 0])

        # Scenario 2: NaN is written to Y[2] and to "predator" of row 2;
        # the contingency is built for the "predator" column.
        zoo = data.Table("zoo")
        with zoo.unlocked():
            zoo.Y[2] = nan
            zoo[2]["predator"] = nan
        by_predator = contingency.Discrete(zoo, "predator")
        assert_dist_equal(by_predator["fish"], [4, 8])
        expected_predator = [
            [1, 3], [11, 9], [4, 8], [7, 1], [2, 8], [19, 22], [1, 4]]
        assert_dist_equal(by_predator, expected_predator)
        np.testing.assert_almost_equal(
            by_predator.col_unknowns, [0, 0, 0, 0, 0, 0, 0])
        np.testing.assert_almost_equal(by_predator.row_unknowns, [0, 0])
        self.assertEqual(1, by_predator.unknowns)
Ejemplo n.º 2
0
    def test_compute_contingency_row_attribute_sparse(self):
        """
        Build contingencies between the first two domain variables while X
        is stored as a sparse matrix, covering the case of a sparse row
        variable. The same expectations are checked for CSR and then CSC.
        """
        table = self.test9
        # NOTE(review): X is rebound on the `self.test9` object itself, so
        # the sparse matrix remains on the fixture after this test -- confirm
        # that this is intended if the fixture is shared between tests.
        first_var, second_var = table.domain[0], table.domain[1]

        for to_sparse in (csr_matrix, csc_matrix):
            # make X sparse (the second pass converts the CSR matrix to CSC)
            table.X = to_sparse(table.X)

            cont = contingency.Discrete(table, first_var, second_var)
            assert_dist_equal(
                cont,
                [[1, 0], [1, 0], [1, 0], [1, 0],
                 [0, 1], [0, 1], [0, 1], [0, 1]])

            cont = contingency.Discrete(table, second_var, first_var)
            assert_dist_equal(
                cont,
                [[1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1]])
    def test_discrete_with_fallback(self):
        """Contingency must be the same when `_compute_contingency` fails.

        A contingency is built once normally, then again after the table's
        `_compute_contingency` is replaced by a mock that raises
        NotImplementedError; both results and their unknown counts are
        compared.
        """
        table = data.Table("zoo")
        table.Y[25] = None
        reference = contingency.Discrete(table, 0)

        # From here on the table's own implementation always raises.
        table._compute_contingency = Mock(side_effect=NotImplementedError)
        via_fallback = contingency.Discrete(table, 0)

        np.testing.assert_almost_equal(via_fallback, reference)
        for attr_name in ("unknowns", "unknown_rows"):
            np.testing.assert_almost_equal(
                getattr(via_fallback, attr_name),
                getattr(reference, attr_name))
Ejemplo n.º 4
0
    def test_sparse(self):
        """Check discrete and continuous contingencies on the sparse table.

        The table comes from `self._construct_sparse()`; the expected values
        below are tied to whatever data that helper builds.
        """
        table = self._construct_sparse()

        # Discrete contingency for column 5.
        discrete = contingency.Discrete(table, 5)
        assert_dist_equal(discrete[0], [2, 0, 0])
        assert_dist_equal(discrete["b"], [0, 1, 1])
        assert_dist_equal(discrete[2], [1, 0, 0])

        # Continuous contingency for column 14.
        continuous = contingency.Continuous(table, 14)
        assert_dist_equal(continuous[0], [[], []])
        assert_dist_equal(continuous["b"], [[1], [1]])
        assert_dist_equal(continuous[2], [[2], [1]])

        # Continuous contingency addressed by variable name.
        continuous = contingency.Continuous(table, "c3")
        assert_dist_equal(continuous[0], [[1.1], [1]])
        assert_dist_equal(continuous["b"], [[1], [1]])
        assert_dist_equal(continuous[2], [[], []])

        # Change the class of row 4 before inspecting columns 13 and 12.
        table[4].set_class(1)
        continuous = contingency.Continuous(table, 13)
        assert_dist_equal(continuous[0], [[], []])
        assert_dist_equal(continuous["b"], [[1, 1.1], [1, 1]])
        assert_dist_equal(continuous[2], [[], []])

        continuous = contingency.Continuous(table, 12)
        assert_dist_equal(continuous[0], [[], []])
        assert_dist_equal(continuous["b"], [[], []])
        assert_dist_equal(continuous[2], [[], []])
Ejemplo n.º 5
0
    def test_discrete(self):
        """Discrete contingencies on `self.zoo` match the known counts.

        The "predator" column is addressed both by name and by its variable
        object; both must give the same matrix.
        """
        expected_first = [
            [4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]]
        expected_predator = [
            [1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]]

        # Column addressed by index.
        cont = contingency.Discrete(self.zoo, 0)
        np.testing.assert_almost_equal(cont["amphibian"], [4, 0])
        np.testing.assert_almost_equal(cont, expected_first)

        # Column addressed by name.
        cont = contingency.Discrete(self.zoo, "predator")
        np.testing.assert_almost_equal(cont["fish"], [4, 9])
        np.testing.assert_almost_equal(cont, expected_predator)

        # Column addressed by the variable taken from the domain.
        predator_var = self.zoo.domain["predator"]
        cont = contingency.Discrete(self.zoo, predator_var)
        np.testing.assert_almost_equal(cont["fish"], [4, 9])
        np.testing.assert_almost_equal(cont, expected_predator)
        self.assertEqual(cont.unknown_rows, 0)
Ejemplo n.º 6
0
    def test_discrete(self):
        """Discrete contingencies on a freshly loaded "zoo" table.

        Checks one row and the full matrix for column 0 and for "predator",
        the latter addressed by name and by variable object.
        """
        zoo = data.Table("zoo")
        first_counts = [
            [4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]]
        predator_counts = [
            [1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]]

        by_index = contingency.Discrete(zoo, 0)
        np.testing.assert_almost_equal(by_index["amphibian"], [4, 0])
        np.testing.assert_almost_equal(by_index, first_counts)

        by_name = contingency.Discrete(zoo, "predator")
        np.testing.assert_almost_equal(by_name["fish"], [4, 9])
        np.testing.assert_almost_equal(by_name, predator_counts)

        by_variable = contingency.Discrete(zoo, zoo.domain["predator"])
        np.testing.assert_almost_equal(by_variable["fish"], [4, 9])
        np.testing.assert_almost_equal(by_variable, predator_counts)
Ejemplo n.º 7
0
    def test_discrete(self):
        """Discrete contingencies on `self.zoo`, with no unknowns expected.

        Column 0 is checked first; "predator" is then checked twice, once
        addressed by name and once by the variable from the domain.
        """
        cont = contingency.Discrete(self.zoo, 0)
        assert_dist_equal(cont["amphibian"], [4, 0])
        assert_dist_equal(
            cont,
            [[4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]])

        # Both ways of naming the column must give identical counts.
        for column in ("predator", self.zoo.domain["predator"]):
            cont = contingency.Discrete(self.zoo, column)
            assert_dist_equal(cont["fish"], [4, 9])
            assert_dist_equal(
                cont,
                [[1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]])

        # Checked on the last contingency (built from the variable object).
        self.assertEqual(sum(cont.col_unknowns), 0)
        self.assertEqual(sum(cont.row_unknowns), 0)
Ejemplo n.º 8
0
    def test_discrete_with_fallback(self):
        """Contingency must be the same when `_compute_contingency` fails.

        Missing values are written into both Y and the first column of X,
        a contingency is built, and then built again after the table's
        `_compute_contingency` is replaced by a mock raising
        NotImplementedError. The counts and all unknown tallies must agree.
        """
        table = data.Table("zoo")
        for row in (25, 24):
            table.Y[row] = None
        for row in (0, 24):
            table.X[row, 0] = None
        reference = contingency.Discrete(table, 0)

        # From here on the table's own implementation always raises.
        table._compute_contingency = Mock(side_effect=NotImplementedError)
        via_fallback = contingency.Discrete(table, 0)

        np.testing.assert_array_equal(
            np.asarray(via_fallback), np.asarray(reference))
        for attr_name in ("unknowns", "row_unknowns", "col_unknowns"):
            np.testing.assert_array_equal(
                getattr(via_fallback, attr_name),
                getattr(reference, attr_name))
Ejemplo n.º 9
0
 def test_array_with_unknowns(self):
     """`array_with_unknowns` must match the expected matrix for "predator".

     NaNs are written to Y[2], Y[6] and to "predator" of rows 2 and 4
     before the contingency is built.
     """
     nan = float("nan")
     zoo = data.Table("zoo")
     zoo.Y[2] = nan
     zoo.Y[6] = nan
     zoo[2]["predator"] = nan
     zoo[4]["predator"] = nan

     by_predator = contingency.Discrete(zoo, "predator")
     # The expected matrix has one more row and one more column than the
     # plain "predator" contingency checked in the other tests.
     expected = [
         [1, 3, 0], [11, 9, 0], [4, 8, 0], [7, 1, 0],
         [2, 8, 0], [18, 21, 1], [1, 4, 0], [1, 0, 1]]
     assert_dist_equal(by_predator.array_with_unknowns, expected)
Ejemplo n.º 10
0
 def __call__(self, feature, data):
     """Score `feature` on `data` through its discrete contingency.

     Builds `contingency.Discrete(data, feature)` and passes it to
     `self.from_contingency` together with one minus the ratio of summed
     contingency unknowns to the summed class distribution.

     Raises:
         ValueError: if the domain has no class variable, or the class
             variable is not a `DiscreteVariable`.
     """
     class_var = data.domain.class_var
     if not class_var:
         raise ValueError("Data with class labels required.")
     if not isinstance(class_var, DiscreteVariable):
         raise ValueError("Data with discrete class labels required.")

     cont = contingency.Discrete(data, feature)
     class_distribution = distribution.Discrete(data, class_var)
     instances_with_class = np.sum(class_distribution)
     known_fraction = 1. - np.sum(cont.unknowns) / instances_with_class
     return self.from_contingency(cont, known_fraction)
Ejemplo n.º 11
0
 def _score_disc_bin():
     """Scoring for discrete attributes, with binarization.

     Delegates to `_score_disc()` when the attribute has at most two
     values. Otherwise asks `_tree_scorers.find_binarization_entropy` for
     the best score and value mapping, and scales the score by one minus
     the ratio of summed contingency unknowns to `len(data)`.

     Returns `REJECT_ATTRIBUTE` when the summed `attr_distr` is zero or the
     best score is not positive; otherwise a tuple
     `(score, node, branches, 2)`.
     """
     value_count = len(attr.values)
     if value_count <= 2:
         return _score_disc()

     table = contingency.Discrete(data, attr)
     attr_distr = np.sum(table, axis=0)
     # Skip instances with missing value of the attribute
     cls_distr = np.sum(table, axis=1)
     if np.sum(attr_distr) == 0:  # all values are missing
         return REJECT_ATTRIBUTE

     best_score, best_mapping = _tree_scorers.find_binarization_entropy(
         table, cls_distr, attr_distr, self.min_samples_leaf)
     if best_score <= 0:
         return REJECT_ATTRIBUTE

     # Scale the score down according to the share of unknowns.
     known_fraction = 1 - np.sum(table.unknowns) / len(data)
     best_score *= known_fraction

     value_mapping, branches = MappedDiscreteNode.branches_from_mapping(
         col_x, best_mapping, value_count)
     return (best_score,
             MappedDiscreteNode(attr, attr_no, value_mapping, None),
             branches,
             2)
Ejemplo n.º 12
0
 def test_compute_contingency_metas(self):
     """Contingency between the variables at domain indices -2 and -4."""
     domain = self.test9.domain
     first_var = domain[-2]
     second_var = domain[-4]
     cont = contingency.Discrete(self.test9, first_var, second_var)
     expected = [[3, 0, 0], [0, 2, 0], [0, 0, 2], [0, 1, 0]]
     assert_dist_equal(cont, expected)
Ejemplo n.º 13
0
 def test_deepcopy(self):
     """A deep copy of a contingency equals the original.

     Equality is checked on the object itself and on its column and row
     variables.
     """
     original = contingency.Discrete(self.zoo, 0)
     clone = copy.deepcopy(original)
     self.assertEqual(clone, original)
     for attr_name in ("col_variable", "row_variable"):
         self.assertEqual(
             getattr(clone, attr_name), getattr(original, attr_name))
Ejemplo n.º 14
0
 def score_from_contingency(f):
     """Score `f` via its discrete contingency on the enclosing `data`.

     The second argument to `self.from_contingency` is one minus the ratio
     of summed contingency unknowns to `instances_with_class`.
     """
     cont = contingency.Discrete(data, f)
     known_fraction = 1. - np.sum(cont.unknowns) / instances_with_class
     return self.from_contingency(cont, known_fraction)
Ejemplo n.º 15
0
def _symmetrical_uncertainty(data, attr1, attr2):
    """Symmetrical uncertainty, Press et al., 1988.

    Computed as twice the information gain from the `attr1`/`attr2`
    contingency, divided by the sum of `_entropy` of that contingency and
    `_entropy` of its transpose.
    """
    table = np.asarray(
        contingency.Discrete(data, attr1, attr2), dtype=float)
    gain = InfoGain().from_contingency(table, 1)
    entropy_sum = _entropy(table) + _entropy(table.T)
    return 2 * gain / entropy_sum