def test_discrete_missing(self):
    """Check contingency counts and unknown tallies with missing values.

    Two scenarios are run on a freshly loaded "zoo" table:

    1. The class value of row 25 and the value at index 0 of row 0 are
       set to NaN, and the contingency is built for column 0.
    2. Both the class value and the "predator" value of row 2 are set to
       NaN, and the contingency is built for "predator".
    """
    nan = float("nan")

    # Scenario 1: the two missing values sit in different rows.
    table = data.Table("zoo")
    with table.unlocked():
        table.Y[25] = nan
        table[0][0] = nan
    result = contingency.Discrete(table, 0)
    expected_counts = [
        [3, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 38], [5, 0]]
    assert_dist_equal(result["amphibian"], [3, 0])
    assert_dist_equal(result, expected_counts)
    np.testing.assert_almost_equal(
        result.col_unknowns, [0, 0, 0, 0, 0, 1, 0])
    np.testing.assert_almost_equal(result.row_unknowns, [1, 0])

    # Scenario 2: both missing values sit in the same row.
    table = data.Table("zoo")
    with table.unlocked():
        table.Y[2] = nan
        table[2]["predator"] = nan
    result = contingency.Discrete(table, "predator")
    expected_counts = [
        [1, 3], [11, 9], [4, 8], [7, 1], [2, 8], [19, 22], [1, 4]]
    assert_dist_equal(result["fish"], [4, 8])
    assert_dist_equal(result, expected_counts)
    np.testing.assert_almost_equal(
        result.col_unknowns, [0, 0, 0, 0, 0, 0, 0])
    np.testing.assert_almost_equal(result.row_unknowns, [0, 0])
    self.assertEqual(1, result.unknowns)
def test_compute_contingency_row_attribute_sparse(self):
    """Contingency of two attributes when the table's X is sparse.

    The same pair of assertions (both variable orders) is run with X
    stored first as a CSR matrix and then as a CSC matrix.

    NOTE(review): this assigns to ``self.test9.X`` in place; confirm the
    fixture is rebuilt per test so other tests do not see a sparse X.
    """
    table = self.test9
    first, second = table.domain[0], table.domain[1]
    expected = [
        [1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]
    expected_swapped = [
        [1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1]]

    for to_sparse in (csr_matrix, csc_matrix):
        # make X sparse in the given format
        table.X = to_sparse(table.X)
        assert_dist_equal(
            contingency.Discrete(table, first, second), expected)
        assert_dist_equal(
            contingency.Discrete(table, second, first), expected_swapped)
def test_discrete_with_fallback(self):
    """Compare the contingency built normally with the one built after
    the table's ``_compute_contingency`` is made to raise.

    The contingency itself, ``unknowns`` and ``unknown_rows`` must agree
    between the two results.
    """
    table = data.Table("zoo")
    table.Y[25] = None
    reference = contingency.Discrete(table, 0)

    # From here on any call to _compute_contingency raises
    # NotImplementedError (presumably triggering the generic code path).
    table._compute_contingency = Mock(side_effect=NotImplementedError)
    via_fallback = contingency.Discrete(table, 0)

    np.testing.assert_almost_equal(via_fallback, reference)
    for attr_name in ("unknowns", "unknown_rows"):
        np.testing.assert_almost_equal(
            getattr(via_fallback, attr_name),
            getattr(reference, attr_name))
def test_sparse(self):
    """Discrete and continuous contingencies on the table returned by
    ``self._construct_sparse()``.

    Every contingency is probed with the keys ``0``, ``"b"`` and ``2``.
    The class of row 4 is set to 1 before the last two checks.
    """
    table = self._construct_sparse()

    def check(cont, expected_0, expected_b, expected_2):
        # Same three lookups, in the same order, for every contingency.
        assert_dist_equal(cont[0], expected_0)
        assert_dist_equal(cont["b"], expected_b)
        assert_dist_equal(cont[2], expected_2)

    check(contingency.Discrete(table, 5),
          [2, 0, 0], [0, 1, 1], [1, 0, 0])
    check(contingency.Continuous(table, 14),
          [[], []], [[1], [1]], [[2], [1]])
    check(contingency.Continuous(table, "c3"),
          [[1.1], [1]], [[1], [1]], [[], []])

    table[4].set_class(1)

    check(contingency.Continuous(table, 13),
          [[], []], [[1, 1.1], [1, 1]], [[], []])
    check(contingency.Continuous(table, 12),
          [[], []], [[], []], [[], []])
def test_discrete(self):
    """Discrete contingency on ``self.zoo`` with the column given by
    index, by name and by variable object.

    The name and variable lookups for "predator" must give the same
    counts; the last contingency must report ``unknown_rows == 0``.
    """
    zoo = self.zoo
    check = np.testing.assert_almost_equal

    # Column addressed by index.
    cont = contingency.Discrete(zoo, 0)
    check(cont["amphibian"], [4, 0])
    check(cont,
          [[4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]])

    # Column addressed by name, then by the variable itself.
    predator_counts = [
        [1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]]
    for column in ("predator", zoo.domain["predator"]):
        cont = contingency.Discrete(zoo, column)
        check(cont["fish"], [4, 9])
        check(cont, predator_counts)

    self.assertEqual(cont.unknown_rows, 0)
def test_discrete(self):
    """Discrete contingency on a freshly loaded "zoo" table, with the
    column given by index, by name and by variable object.
    """
    table = data.Table("zoo")

    by_index = contingency.Discrete(table, 0)
    np.testing.assert_almost_equal(by_index["amphibian"], [4, 0])
    np.testing.assert_almost_equal(
        by_index,
        [[4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]])

    # Both ways of naming the "predator" column must give these counts.
    fish_row = [4, 9]
    predator_counts = [
        [1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]]

    by_name = contingency.Discrete(table, "predator")
    np.testing.assert_almost_equal(by_name["fish"], fish_row)
    np.testing.assert_almost_equal(by_name, predator_counts)

    by_variable = contingency.Discrete(table, table.domain["predator"])
    np.testing.assert_almost_equal(by_variable["fish"], fish_row)
    np.testing.assert_almost_equal(by_variable, predator_counts)
def test_discrete(self):
    """Discrete contingency on ``self.zoo`` with the column given by
    index, by name and by variable object.

    For the last contingency both ``col_unknowns`` and ``row_unknowns``
    must sum to zero.
    """
    zoo = self.zoo

    cont = contingency.Discrete(zoo, 0)
    assert_dist_equal(cont["amphibian"], [4, 0])
    assert_dist_equal(
        cont,
        [[4, 0], [20, 0], [13, 0], [4, 4], [10, 0], [2, 39], [5, 0]])

    predator_counts = [
        [1, 3], [11, 9], [4, 9], [7, 1], [2, 8], [19, 22], [1, 4]]
    # Same expectations whether the column is a name or a variable.
    for column in ("predator", zoo.domain["predator"]):
        cont = contingency.Discrete(zoo, column)
        assert_dist_equal(cont["fish"], [4, 9])
        assert_dist_equal(cont, predator_counts)

    col_total = sum(cont.col_unknowns)
    self.assertEqual(col_total, 0)
    row_total = sum(cont.row_unknowns)
    self.assertEqual(row_total, 0)
def test_discrete_with_fallback(self):
    """Compare the contingency built normally with the one built after
    the table's ``_compute_contingency`` is made to raise.

    Class values of rows 25 and 24 and column 0 of rows 0 and 24 are set
    to None beforehand. The counts, ``unknowns``, ``row_unknowns`` and
    ``col_unknowns`` must be exactly equal between the two results.
    """
    table = data.Table("zoo")
    for row in (25, 24):
        table.Y[row] = None
    for row in (0, 24):
        table.X[row, 0] = None

    reference = contingency.Discrete(table, 0)

    # From here on any call to _compute_contingency raises
    # NotImplementedError (presumably triggering the generic code path).
    table._compute_contingency = Mock(side_effect=NotImplementedError)
    via_fallback = contingency.Discrete(table, 0)

    np.testing.assert_array_equal(
        np.asarray(via_fallback), np.asarray(reference))
    for attr_name in ("unknowns", "row_unknowns", "col_unknowns"):
        np.testing.assert_array_equal(
            getattr(via_fallback, attr_name),
            getattr(reference, attr_name))
def test_array_with_unknowns(self):
    """Check ``array_with_unknowns`` of a "predator" contingency.

    Class values of rows 2 and 6 and "predator" values of rows 2 and 4
    are set to NaN on a freshly loaded "zoo" table. The expected array
    has 8 rows of 3 entries; presumably the last row and last column
    carry the unknown counts (confirm against contingency.Discrete).
    """
    nan = float("nan")
    table = data.Table("zoo")
    for row in (2, 6):
        table.Y[row] = nan
    for row in (2, 4):
        table[row]["predator"] = nan

    expected = [
        [1, 3, 0],
        [11, 9, 0],
        [4, 8, 0],
        [7, 1, 0],
        [2, 8, 0],
        [18, 21, 1],
        [1, 4, 0],
        [1, 0, 1],
    ]
    result = contingency.Discrete(table, "predator")
    assert_dist_equal(result.array_with_unknowns, expected)
def __call__(self, feature, data):
    """Score ``feature`` on ``data`` through its contingency table.

    The contingency of ``feature`` is passed to ``self.from_contingency``
    together with ``1 - (sum of unknowns) / (sum of the class
    distribution)``.

    Raises:
        ValueError: if ``data.domain.class_var`` is falsy, or is not a
            ``DiscreteVariable``.
    """
    class_var = data.domain.class_var
    if not class_var:
        raise ValueError("Data with class labels required.")
    if not isinstance(class_var, DiscreteVariable):
        raise ValueError("Data with discrete class labels required.")

    table = contingency.Discrete(data, feature)
    class_distribution = distribution.Discrete(data, data.domain.class_var)
    instances_with_class = np.sum(class_distribution)
    unknown_total = np.sum(table.unknowns)
    return self.from_contingency(
        table, 1. - unknown_total / instances_with_class)
def _score_disc_bin():
    """Score a discrete attribute using a binarized split.

    Reads ``attr``, ``data``, ``col_x``, ``attr_no``, ``self`` and
    ``_score_disc`` from the enclosing scope.

    Returns ``_score_disc()`` when the attribute has at most two values,
    ``REJECT_ATTRIBUTE`` when the contingency sums to zero or the best
    score is not positive, and otherwise the tuple
    ``(score, node, branches, 2)``.
    """
    value_count = len(attr.values)
    if value_count <= 2:
        return _score_disc()

    table = contingency.Discrete(data, attr)
    attr_totals = np.sum(table, axis=0)
    # Skip instances with missing value of the attribute
    class_totals = np.sum(table, axis=1)
    if np.sum(attr_totals) == 0:
        # all values are missing
        return REJECT_ATTRIBUTE

    score, value_mapping = _tree_scorers.find_binarization_entropy(
        table, class_totals, attr_totals, self.min_samples_leaf)
    if score <= 0:
        return REJECT_ATTRIBUTE

    # Scale the score by one minus the ratio of unknowns to len(data).
    score *= 1 - np.sum(table.unknowns) / len(data)
    mapping, branches = MappedDiscreteNode.branches_from_mapping(
        col_x, value_mapping, value_count)
    split_node = MappedDiscreteNode(attr, attr_no, mapping, None)
    return score, split_node, branches, 2
def test_compute_contingency_metas(self):
    """Contingency between two variables taken from ``self.test9`` with
    negative domain indices (presumably meta attributes, per the test
    name; confirm against the fixture).
    """
    domain = self.test9.domain
    col_var = domain[-2]
    row_var = domain[-4]
    expected = [[3, 0, 0], [0, 2, 0], [0, 0, 2], [0, 1, 0]]
    result = contingency.Discrete(self.test9, col_var, row_var)
    assert_dist_equal(result, expected)
def test_deepcopy(self):
    """A deep copy of a discrete contingency must compare equal to the
    original and keep equal ``col_variable`` and ``row_variable``.
    """
    original = contingency.Discrete(self.zoo, 0)
    clone = copy.deepcopy(original)
    self.assertEqual(clone, original)
    for attr_name in ("col_variable", "row_variable"):
        self.assertEqual(
            getattr(clone, attr_name), getattr(original, attr_name))
def score_from_contingency(f):
    """Score ``f`` via ``self.from_contingency`` on its contingency table.

    Reads ``data``, ``self`` and ``instances_with_class`` from the
    enclosing scope. The second argument passed on is
    ``1 - (sum of unknowns) / instances_with_class``.
    """
    table = contingency.Discrete(data, f)
    unknown_total = np.sum(table.unknowns)
    known_share = 1. - unknown_total / instances_with_class
    return self.from_contingency(table, known_share)
def _symmetrical_uncertainty(data, attr1, attr2):
    """Symmetrical uncertainty (Press et al., 1988) of two attributes.

    Computed as ``2 * IG / (H(table) + H(table.T))``, where ``table`` is
    the float contingency array of ``attr1`` and ``attr2``, ``IG`` comes
    from ``InfoGain().from_contingency(table, 1)`` and ``H`` is
    ``_entropy``.
    """
    table = np.asarray(
        contingency.Discrete(data, attr1, attr2), dtype=float)
    gain = InfoGain().from_contingency(table, 1)
    entropy_sum = _entropy(table) + _entropy(table.T)
    return 2 * gain / entropy_sum