def test_get_distributions(self):
    """get_distributions on iris yields 4 Continuous dists and 1 Discrete."""
    d = data.Table("iris")
    ddist = distribution.get_distributions(d)
    # Iris: 4 continuous attributes + 1 discrete class variable.
    self.assertEqual(len(ddist), 5)
    for i in range(4):
        self.assertIsInstance(ddist[i], distribution.Continuous)
    self.assertIsInstance(ddist[-1], distribution.Discrete)
    # (value, count) pairs for attribute index 2; transposed so that
    # row 0 holds the distinct values and row 1 the frequencies.
    freqs = np.array(
        [
            (1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
            (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
            (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
            (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
            (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
            (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
            (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
            (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
            (6.6, 1), (6.7, 2), (6.9, 1),
        ]
    ).T
    np.testing.assert_almost_equal(ddist[2], freqs)
    # Class distribution: 50 instances of each of the three species.
    np.testing.assert_almost_equal(ddist[-1], [50, 50, 50])
def test_distributions(self):
    """Distributions computed from a SQL-backed iris table are sane."""
    table = SqlTable(self.conn, self.iris, inspect_values=True)
    distributions = get_distributions(table)
    self.assertEqual(len(distributions), 5)
    sepal_length = distributions[0]
    # Known summary statistics of iris sepal length.
    self.assertAlmostEqual(sepal_length.min(), 4.3)
    self.assertAlmostEqual(sepal_length.max(), 7.9)
    self.assertAlmostEqual(sepal_length.mean(), 5.8, 1)
def test_get_distributions(self):
    """get_distributions on iris yields 4 Continuous dists and 1 Discrete."""
    d = data.Table("iris")
    ddist = distribution.get_distributions(d)
    # Iris: 4 continuous attributes + 1 discrete class variable.
    self.assertEqual(len(ddist), 5)
    for i in range(4):
        self.assertIsInstance(ddist[i], distribution.Continuous)
    self.assertIsInstance(ddist[-1], distribution.Discrete)
    # (value, count) pairs for attribute index 2; transposed so that
    # row 0 holds the distinct values and row 1 the frequencies.
    freqs = np.array([
        (1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
        (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
        (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
        (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
        (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
        (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
        (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
        (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
        (6.6, 1), (6.7, 2), (6.9, 1),
    ]).T
    np.testing.assert_almost_equal(ddist[2], freqs)
    # Class distribution: 50 instances of each of the three species.
    np.testing.assert_almost_equal(ddist[-1], [50, 50, 50])
def setUpClass(cls):
    """Build the shared iris table and a small two-column continuous table."""
    cls.iris = data.Table("iris")
    attrs = [data.ContinuousVariable('n1'), data.ContinuousVariable('n2')]
    # Second column contains one missing value to exercise unknown handling.
    first = range(10)
    second = [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]
    cls.data = data.Table.from_numpy(
        data.Domain(attributes=attrs),
        X=np.array([first, second]).T,
    )
    cls.n1, cls.n2 = distribution.get_distributions(cls.data)
def __call__(self, data):
    """Return *data* mapped into a domain with normalized variables.

    Class variables are normalized too when ``self.transform_class`` is set;
    metas are passed through unchanged.
    """
    dists = distribution.get_distributions(data)
    attrs = data.domain.attributes
    # zip truncates at the attribute count, pairing each attribute with
    # its distribution (class distributions follow in `dists`).
    normalized_attrs = [self.normalize(dist, var)
                        for dist, var in zip(dists, attrs)]
    if self.transform_class:
        offset = len(attrs)
        normalized_classes = [
            self.normalize(dists[offset + idx], var)
            for idx, var in enumerate(data.domain.class_vars)
        ]
    else:
        normalized_classes = data.domain.class_vars
    new_domain = Domain(normalized_attrs, normalized_classes,
                        data.domain.metas)
    return data.from_table(new_domain, data)
def setUpClass(cls):
    """Prepare the iris table and a tiny continuous table with one NaN."""
    cls.iris = data.Table("iris")
    domain = data.Domain(
        attributes=[data.ContinuousVariable('n1'),
                    data.ContinuousVariable('n2')])
    values = np.array([range(10),
                       [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]]).T
    cls.data = data.Table.from_numpy(domain, X=values)
    cls.n1, cls.n2 = distribution.get_distributions(cls.data)
def setUp(self):
    """Create a two-attribute discrete table plus expected frequencies."""
    self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
    total = sum(self.freqs)
    self.rfreqs = [freq / total for freq in self.freqs]
    rgb_var = data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a'))
    num_var = data.DiscreteVariable('num', values=('1', '2', '3'))
    # NaNs exercise unknown-value accounting in the distributions.
    columns = np.array([
        [0, 2, 0, 1, 1, 0, np.nan, 1],
        [0, 2, 0, np.nan, 1, 2, np.nan, 1],
    ]).T
    self.data = data.Table.from_numpy(
        data.Domain(attributes=[rgb_var, num_var]), X=columns)
    self.rgb, self.num = distribution.get_distributions(self.data)
def setUp(self):
    """Create a discrete table (one ordered variable) and expected freqs."""
    self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
    total = sum(self.freqs)
    self.rfreqs = [count / total for count in self.freqs]
    variables = [
        data.DiscreteVariable("rgb", values=["r", "g", "b", "a"]),
        data.DiscreteVariable("num", values=["1", "2", "3"], ordered=True),
    ]
    # Rows given column-wise, then transposed; NaNs mark unknowns.
    matrix = np.array([[0, 2, 0, 1, 1, 0, np.nan, 1],
                       [0, 2, 0, np.nan, 1, 2, np.nan, 1]]).T
    self.data = data.Table.from_numpy(
        data.Domain(attributes=variables), X=matrix)
    self.rgb, self.num = distribution.get_distributions(self.data)
def setUp(self):
    """Build fixture table with one plain and one ordered discrete variable."""
    self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
    norm = sum(self.freqs)
    self.rfreqs = [value / norm for value in self.freqs]
    domain = data.Domain(
        attributes=[
            data.DiscreteVariable('rgb', values=['r', 'g', 'b', 'a']),
            data.DiscreteVariable('num', values=['1', '2', '3'],
                                  ordered=True),
        ])
    # Two columns of 8 rows each; NaN entries are unknown values.
    X = np.array([[0, 2, 0, 1, 1, 0, np.nan, 1],
                  [0, 2, 0, np.nan, 1, 2, np.nan, 1]]).T
    self.data = data.Table.from_numpy(domain, X=X)
    self.rgb, self.num = distribution.get_distributions(self.data)
def iterate_states(self, state): """ Iterate through all combinations of attributes as ordered by Relief, starting with a single attribute if Mosaic is colored by class distributions, and two if by Pearson. """ # If we put initialization of `self.attrs` to `initialize`, # `score_heuristic` would be run on every call to master's `set_data`. master = self.master data = master.discrete_data min_attrs, max_attrs = self.attr_range() if min_attrs > max_attrs: return if state is None: # on the first call, compute order if self._compute_class_dists(): self.marginal = get_distribution(data, data.domain.class_var) self.marginal.normalize() state = list(range(min_attrs)) else: self.marginal = get_distributions(data) for dist in self.marginal: dist.normalize() state = list(range(min_attrs)) n_attrs = len(data.domain.attributes) while True: yield state # Reset while running; just abort if self.attr_ordering is None: break for up, _ in enumerate(state): state[up] += 1 if up + 1 == len(state) or state[up] < state[up + 1]: break state[up] = up if state[-1] == len(self.attr_ordering): if len(state) < min(max_attrs, n_attrs): state = list(range(len(state) + 1)) else: break
def iterate_states(self, state): """ Iterate through all combinations of attributes as ordered by Relief, starting with a single attribute if Mosaic is colored by class distributions, and two if by Pearson. """ # If we put initialization of `self.attrs` to `initialize`, # `score_heuristic` would be run on every call to master's `set_data`. master = self.master data = master.discrete_data if state is None: # on the first call, compute order if self._compute_class_dists(): self.marginal = get_distribution(data, data.domain.class_var) self.marginal.normalize() state = [0] else: self.marginal = get_distributions(data) for dist in self.marginal: dist.normalize() state = [0, 1] n_attrs = len(data.domain.attributes) while True: yield state # Reset while running; just abort if self.attr_ordering is None: break for up, _ in enumerate(state): state[up] += 1 if up + 1 == len(state) or state[up] < state[up + 1]: break state[up] = up if state[-1] == len(self.attr_ordering): if len(state) < min(self.max_attrs, n_attrs): state = list(range(len(state) + 1)) else: break
def test_sparse_get_distributions(self):
    """Distributions over a sparse (CSR) table, unweighted and weighted."""

    def assert_dist_and_unknowns(computed, gold_dist):
        # Check both the distribution values and the unknown count:
        # unknowns = total (weighted) rows minus the summed frequencies.
        nonlocal d
        gold_dist = np.array(gold_dist)
        sum_dist = np.sum(gold_dist[1, :] if gold_dist.ndim == 2
                          else gold_dist)
        n_all = np.sum(d.W) if d.has_weights() else len(d)
        np.testing.assert_almost_equal(computed, gold_dist)
        self.assertEqual(computed.unknowns, n_all - sum_dist)

    # 10 discrete ("d0".."d9", values a/b/c) + 10 continuous ("c0".."c9").
    domain = data.Domain(
        [data.DiscreteVariable("d%i" % i, values=list("abc"))
         for i in range(10)] +
        [data.ContinuousVariable("c%i" % i) for i in range(10)])
    # Dense picture of the sparse matrix below (blank = not stored):
    #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
    # ------------------------------------------------------------
    #     2     2  1  1  2        1           1  1     2  2  1  0  2
    #        1  1  0  0  1        2                 2     2  1  0
    #           1     2  0
    #
    #        2        0  1                    1.1
    #
    sdata = np.array(
        [2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
         1, 1, 0, 0, 1, 2, 2, 1, 0,
         1, 2, 0,
         2, 0, 1, 1.1])
    indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
               2, 3, 4, 5, 6, 8, 14, 16, 17,
               3, 5, 6,
               2, 5, 6, 13]
    indptr = [0, 11, 20, 23, 23, 27]
    X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
    d = data.Table.from_numpy(domain, X)
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    # Discrete columns: counts per value a/b/c (implicit zeros ignored).
    assert_dist_and_unknowns(ddist[0], [0, 0, 0])
    assert_dist_and_unknowns(ddist[1], [0, 0, 1])
    assert_dist_and_unknowns(ddist[2], [0, 1, 1])
    assert_dist_and_unknowns(ddist[3], [0, 2, 1])
    assert_dist_and_unknowns(ddist[4], [1, 1, 0])
    assert_dist_and_unknowns(ddist[5], [2, 1, 1])
    assert_dist_and_unknowns(ddist[6], [1, 2, 1])
    assert_dist_and_unknowns(ddist[7], [0, 0, 0])
    assert_dist_and_unknowns(ddist[8], [0, 0, 1])
    assert_dist_and_unknowns(ddist[9], [0, 1, 0])
    # Continuous columns: 2 x k arrays of (value, frequency).
    z = np.zeros((2, 0))
    assert_dist_and_unknowns(ddist[10], z)
    assert_dist_and_unknowns(ddist[11], z)
    assert_dist_and_unknowns(ddist[12], z)
    assert_dist_and_unknowns(ddist[13], [[1, 1.1], [1, 1]])
    assert_dist_and_unknowns(ddist[14], [[1, 2], [1, 1]])
    assert_dist_and_unknowns(ddist[15], z)
    assert_dist_and_unknowns(ddist[16], [[1, 2], [1, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [2]])
    assert_dist_and_unknowns(ddist[18], [[2], [1]])
    assert_dist_and_unknowns(ddist[19], z)
    # Repeat with row weights 1..5: frequencies become weighted sums.
    d.set_weights(np.array([1, 2, 3, 4, 5]))
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    assert_dist_and_unknowns(ddist[0], [0, 0, 0])
    assert_dist_and_unknowns(ddist[1], [0, 0, 1])
    assert_dist_and_unknowns(ddist[2], [0, 2, 5])
    assert_dist_and_unknowns(ddist[3], [0, 5, 1])
    assert_dist_and_unknowns(ddist[4], [2, 1, 0])
    assert_dist_and_unknowns(ddist[5], [7, 1, 3])
    assert_dist_and_unknowns(ddist[6], [3, 7, 1])
    assert_dist_and_unknowns(ddist[7], [0, 0, 0])
    assert_dist_and_unknowns(ddist[8], [0, 0, 2])
    assert_dist_and_unknowns(ddist[9], [0, 1, 0])
    z = np.zeros((2, 0))
    assert_dist_and_unknowns(ddist[10], z)
    assert_dist_and_unknowns(ddist[11], z)
    assert_dist_and_unknowns(ddist[12], z)
    assert_dist_and_unknowns(ddist[13], [[1, 1.1], [1, 5]])
    assert_dist_and_unknowns(ddist[14], [[1, 2], [1, 2]])
    assert_dist_and_unknowns(ddist[15], z)
    assert_dist_and_unknowns(ddist[16], [[1, 2], [2, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [3]])
    assert_dist_and_unknowns(ddist[18], [[2], [1]])
    assert_dist_and_unknowns(ddist[19], z)
def __call__(self, data):
    """Continuize a Domain (or a Table's domain) per `multinomial_treatment`.

    Accepts either a Domain or a Table; returns a new Domain whose discrete
    variables are replaced according to `self.multinomial_treatment`.
    Raises ValueError for multinomial data under ReportError, and TypeError
    if FrequentAsBase needs distributions but only a Domain was given.
    """
    def transform_discrete(var):
        # Return the list of continuous replacements for a discrete `var`
        # (may be empty when the variable is dropped).
        if (len(var.values) < 2 or treat == Continuize.Remove or
                treat == Continuize.RemoveMultinomial
                and len(var.values) > 2):
            return []
        if treat == Continuize.AsOrdinal:
            # Keep values as 0..k-1 via an identity transform.
            new_var = ContinuousVariable(var.name,
                                         compute_value=Identity(var))
            return [new_var]
        if treat == Continuize.AsNormalizedOrdinal:
            n_values = max(1, len(var.values))
            if self.zero_based:
                # Map to [0, 1].
                return [ContinuousVariable(
                    var.name,
                    compute_value=Normalizer(var, 0, 1 / (n_values - 1)))]
            else:
                # Map to [-1, 1].
                return [ContinuousVariable(
                    var.name,
                    compute_value=Normalizer(var, (n_values - 1) / 2,
                                             2 / (n_values - 1)))]
        # Indicator (one-hot-like) encodings: pick a base value to omit.
        new_vars = []
        if treat == Continuize.Indicators:
            base = -1  # no base value: indicator for every value
        elif treat in (Continuize.FirstAsBase,
                       Continuize.RemoveMultinomial):
            base = max(var.base_value, 0)
        else:
            # FrequentAsBase: the modus of the precomputed distribution.
            base = dists[var_ptr].modus()
        # Indicator1 encodes absent as -1, Indicator as 0.
        ind_class = [Indicator1, Indicator][self.zero_based]
        for i, val in enumerate(var.values):
            if i == base:
                continue
            new_var = ContinuousVariable(
                "{}={}".format(var.name, val),
                compute_value=ind_class(var, i))
            new_vars.append(new_var)
        return new_vars

    def transform_list(s):
        # Transform a sequence of variables, advancing `var_ptr` so it
        # stays aligned with the `dists` list when distributions are used.
        nonlocal var_ptr
        new_vars = []
        for var in s:
            if var.is_discrete:
                new_vars += transform_discrete(var)
                if needs_discrete:
                    var_ptr += 1
            else:
                # Continuous variables pass through unchanged.
                new_var = var
                if new_var is not None:
                    new_vars.append(new_var)
                if needs_continuous:
                    var_ptr += 1
        return new_vars

    treat = self.multinomial_treatment
    transform_class = self.transform_class
    domain = data if isinstance(data, Domain) else data.domain
    if (treat == Continuize.ReportError and
            any(var.is_discrete and len(var.values) > 2 for var in domain)):
        raise ValueError("data has multinomial attributes")
    # Distributions are only needed to find the most frequent value.
    needs_discrete = (treat == Continuize.FrequentAsBase and
                      domain.has_discrete_attributes(transform_class))
    needs_continuous = False
    if needs_discrete:
        if isinstance(data, Domain):
            # A bare Domain carries no data to compute distributions from.
            raise TypeError("continuizer requires data")
        dists = distribution.get_distributions(
            data, not needs_discrete, not needs_continuous)
    var_ptr = 0
    new_attrs = transform_list(domain.attributes)
    if transform_class:
        new_classes = transform_list(domain.class_vars)
    else:
        new_classes = domain.class_vars
    return Domain(new_attrs, new_classes, domain.metas)
def test_sparse_get_distributions(self):
    """Distributions over a sparse table built from a dense matrix."""

    def assert_dist_and_unknowns(computed, goal_dist):
        # Check values and the unknown count: unknowns equal the total
        # (weighted) row count minus the summed frequencies.
        nonlocal d
        goal_dist = np.array(goal_dist)
        sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim == 2
                          else goal_dist)
        n_all = np.sum(d.W) if d.has_weights() else len(d)
        np.testing.assert_almost_equal(computed, goal_dist)
        self.assertEqual(computed.unknowns, n_all - sum_dist)

    # 10 discrete ("d0".."d9", values a/b/c) + 10 continuous ("c0".."c9").
    domain = data.Domain(
        [data.DiscreteVariable("d%i" % i, values=list("abc"))
         for i in range(10)] +
        [data.ContinuousVariable("c%i" % i) for i in range(10)])
    # pylint: disable=bad-whitespace
    X = sp.csr_matrix(
        # 0  1  2  3  4       5       6       7  8  9 10 11 12 13   14 15 16 17      18 19
        # --------------------------------------------------------------------------------
        [[0, 2, 0, 2, 1,      1,      2,      0, 0, 1, 0, 0, 0, 1,   1, 0, 2, np.nan, 2, 0],
         [0, 0, 1, 1, np.nan, np.nan, 1,      0, 2, 0, 0, 0, 0, 0,   2, 0, 1, np.nan, 0, 0],
         [0, 0, 0, 1, 0,      2,      np.nan, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0],
         [0, 0, 0, 0, 0,      0,      0,      0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0],
         [0, 0, 2, 0, 0,      0,      1,      0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0,      0, 0]]
    )
    # Store an explicit zero so explicitly-stored zeros are exercised too.
    X[0, 0] = 0
    d = data.Table.from_numpy(domain, X)
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    # Discrete columns: counts per value a/b/c, zeros included.
    zeros = [5, 0, 0]
    assert_dist_and_unknowns(ddist[0], zeros)
    assert_dist_and_unknowns(ddist[1], [4, 0, 1])
    assert_dist_and_unknowns(ddist[2], [3, 1, 1])
    assert_dist_and_unknowns(ddist[3], [2, 2, 1])
    assert_dist_and_unknowns(ddist[4], [3, 1, 0])
    assert_dist_and_unknowns(ddist[5], [2, 1, 1])
    assert_dist_and_unknowns(ddist[6], [1, 2, 1])
    assert_dist_and_unknowns(ddist[7], zeros)
    assert_dist_and_unknowns(ddist[8], [4, 0, 1])
    assert_dist_and_unknowns(ddist[9], [4, 1, 0])
    # Continuous columns: 2 x k arrays of (value, frequency).
    zeros = [[0], [5]]
    assert_dist_and_unknowns(ddist[10], zeros)
    assert_dist_and_unknowns(ddist[11], zeros)
    assert_dist_and_unknowns(ddist[12], zeros)
    assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[15], zeros)
    assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [3]])
    assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]])
    assert_dist_and_unknowns(ddist[19], zeros)
    # Repeat with row weights 1..5: frequencies become weighted sums.
    d.set_weights(np.array([1, 2, 3, 4, 5]))
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    assert_dist_and_unknowns(ddist[0], [15, 0, 0])
    assert_dist_and_unknowns(ddist[1], [14, 0, 1])
    assert_dist_and_unknowns(ddist[2], [8, 2, 5])
    assert_dist_and_unknowns(ddist[3], [9, 5, 1])
    assert_dist_and_unknowns(ddist[4], [12, 1, 0])
    assert_dist_and_unknowns(ddist[5], [9, 1, 3])
    assert_dist_and_unknowns(ddist[6], [4, 7, 1])
    assert_dist_and_unknowns(ddist[7], [15, 0, 0])
    assert_dist_and_unknowns(ddist[8], [13, 0, 2])
    assert_dist_and_unknowns(ddist[9], [14, 1, 0])
    zeros = [[0], [15]]
    assert_dist_and_unknowns(ddist[10], zeros)
    assert_dist_and_unknowns(ddist[11], zeros)
    assert_dist_and_unknowns(ddist[12], zeros)
    assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]])
    assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]])
    assert_dist_and_unknowns(ddist[15], zeros)
    assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [12]])
    assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]])
    assert_dist_and_unknowns(ddist[19], zeros)
def test_sparse_get_distributions(self):
    """Distributions over a CSR table (no unknown-count checks)."""
    # 10 discrete ("d0".."d9", values a/b/c) + 10 continuous ("c0".."c9").
    domain = data.Domain(
        [data.DiscreteVariable("d%i" % i, values=list("abc"))
         for i in range(10)] +
        [data.ContinuousVariable("c%i" % i) for i in range(10)])
    # Dense picture of the sparse matrix below (blank = not stored):
    #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
    # ------------------------------------------------------------
    #     2     2  1  1  2        1           1  1     2  2  1  0  2
    #        1  1  0  0  1        2                 2     2  1  0
    #           1     2  0
    #
    #        2        0  1                    1.1
    #
    sdata = np.array(
        [2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
         1, 1, 0, 0, 1, 2, 2, 1, 0,
         1, 2, 0,
         2, 0, 1, 1.1])
    indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
               2, 3, 4, 5, 6, 8, 14, 16, 17,
               3, 5, 6,
               2, 5, 6, 13]
    indptr = [0, 11, 20, 23, 23, 27]
    X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
    d = data.Table.from_numpy(domain, X)
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    # Discrete columns: counts per value a/b/c (implicit zeros ignored).
    np.testing.assert_almost_equal(ddist[0], [0, 0, 0])
    np.testing.assert_almost_equal(ddist[1], [0, 0, 1])
    np.testing.assert_almost_equal(ddist[2], [0, 1, 1])
    np.testing.assert_almost_equal(ddist[3], [0, 2, 1])
    np.testing.assert_almost_equal(ddist[4], [1, 1, 0])
    np.testing.assert_almost_equal(ddist[5], [2, 1, 1])
    np.testing.assert_almost_equal(ddist[6], [1, 2, 1])
    np.testing.assert_almost_equal(ddist[7], [0, 0, 0])
    np.testing.assert_almost_equal(ddist[8], [0, 0, 1])
    np.testing.assert_almost_equal(ddist[9], [0, 1, 0])
    # Continuous columns: 2 x k arrays of (value, frequency).
    z = np.zeros((2, 0))
    np.testing.assert_almost_equal(ddist[10], z)
    np.testing.assert_almost_equal(ddist[11], z)
    np.testing.assert_almost_equal(ddist[12], z)
    np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 1]])
    np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 1]])
    np.testing.assert_almost_equal(ddist[15], z)
    np.testing.assert_almost_equal(ddist[16], [[1, 2], [1, 1]])
    np.testing.assert_almost_equal(ddist[17], [[0], [2]])
    np.testing.assert_almost_equal(ddist[18], [[2], [1]])
    np.testing.assert_almost_equal(ddist[19], z)
    # Repeat with row weights 1..5: frequencies become weighted sums.
    d.set_weights(np.array([1, 2, 3, 4, 5]))
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    np.testing.assert_almost_equal(ddist[0], [0, 0, 0])
    np.testing.assert_almost_equal(ddist[1], [0, 0, 1])
    np.testing.assert_almost_equal(ddist[2], [0, 2, 5])
    np.testing.assert_almost_equal(ddist[3], [0, 5, 1])
    np.testing.assert_almost_equal(ddist[4], [2, 1, 0])
    np.testing.assert_almost_equal(ddist[5], [7, 1, 3])
    np.testing.assert_almost_equal(ddist[6], [3, 7, 1])
    np.testing.assert_almost_equal(ddist[7], [0, 0, 0])
    np.testing.assert_almost_equal(ddist[8], [0, 0, 2])
    np.testing.assert_almost_equal(ddist[9], [0, 1, 0])
    z = np.zeros((2, 0))
    np.testing.assert_almost_equal(ddist[10], z)
    np.testing.assert_almost_equal(ddist[11], z)
    np.testing.assert_almost_equal(ddist[12], z)
    np.testing.assert_almost_equal(ddist[13], [[1, 1.1], [1, 5]])
    np.testing.assert_almost_equal(ddist[14], [[1, 2], [1, 2]])
    np.testing.assert_almost_equal(ddist[15], z)
    np.testing.assert_almost_equal(ddist[16], [[1, 2], [2, 1]])
    np.testing.assert_almost_equal(ddist[17], [[0], [3]])
    np.testing.assert_almost_equal(ddist[18], [[2], [1]])
    np.testing.assert_almost_equal(ddist[19], z)
def __call__(self, data):
    """Continuize a Domain (or a Table's domain) per `multinomial_treatment`.

    Accepts either a Domain or a Table; returns a new Domain whose discrete
    variables are replaced according to `self.multinomial_treatment`.
    Raises ValueError for multinomial data under ReportError, and TypeError
    if FrequentAsBase needs distributions but only a Domain was given.
    """
    def transform_discrete(var):
        # Return the list of continuous replacements for a discrete `var`
        # (may be empty when the variable is dropped).
        if (len(var.values) < 2 or treat == Continuize.Remove or
                treat == Continuize.RemoveMultinomial
                and len(var.values) > 2):
            return []
        if treat == Continuize.AsOrdinal:
            # Keep values as 0..k-1 via an identity transform.
            new_var = ContinuousVariable(var.name,
                                         compute_value=Identity(var))
            return [new_var]
        if treat == Continuize.AsNormalizedOrdinal:
            n_values = max(1, len(var.values))
            if self.zero_based:
                # Map to [0, 1].
                return [
                    ContinuousVariable(var.name,
                                       compute_value=Normalizer(
                                           var, 0, 1 / (n_values - 1)))
                ]
            else:
                # Map to [-1, 1].
                return [
                    ContinuousVariable(var.name,
                                       compute_value=Normalizer(
                                           var, (n_values - 1) / 2,
                                           2 / (n_values - 1)))
                ]
        # Indicator (one-hot-like) encodings: pick a base value to omit.
        new_vars = []
        if treat == Continuize.Indicators:
            base = -1  # no base value: indicator for every value
        elif treat in (Continuize.FirstAsBase,
                       Continuize.RemoveMultinomial):
            base = max(var.base_value, 0)
        else:
            # FrequentAsBase: the modus of the precomputed distribution.
            base = dists[var_ptr].modus()
        # Indicator1 encodes absent as -1, Indicator as 0.
        ind_class = [Indicator1, Indicator][self.zero_based]
        for i, val in enumerate(var.values):
            if i == base:
                continue
            new_var = ContinuousVariable("{}={}".format(var.name, val),
                                         compute_value=ind_class(var, i))
            new_vars.append(new_var)
        return new_vars

    def transform_list(s):
        # Transform a sequence of variables, advancing `var_ptr` so it
        # stays aligned with the `dists` list when distributions are used.
        nonlocal var_ptr
        new_vars = []
        for var in s:
            if var.is_discrete:
                new_vars += transform_discrete(var)
                if needs_discrete:
                    var_ptr += 1
            else:
                # Continuous variables pass through unchanged.
                new_var = var
                if new_var is not None:
                    new_vars.append(new_var)
                if needs_continuous:
                    var_ptr += 1
        return new_vars

    treat = self.multinomial_treatment
    transform_class = self.transform_class
    domain = data if isinstance(data, Domain) else data.domain
    if (treat == Continuize.ReportError and
            any(var.is_discrete and len(var.values) > 2 for var in domain)):
        raise ValueError("data has multinomial attributes")
    # Distributions are only needed to find the most frequent value.
    needs_discrete = (treat == Continuize.FrequentAsBase and
                      domain.has_discrete_attributes(transform_class))
    needs_continuous = False
    if needs_discrete:
        if isinstance(data, Domain):
            # A bare Domain carries no data to compute distributions from.
            raise TypeError("continuizer requires data")
        dists = distribution.get_distributions(data, not needs_discrete,
                                               not needs_continuous)
    var_ptr = 0
    new_attrs = transform_list(domain.attributes)
    if transform_class:
        new_classes = transform_list(domain.class_vars)
    else:
        new_classes = domain.class_vars
    return Domain(new_attrs, new_classes, domain.metas)
def __call__(self, data):
    """Continuize a Domain (or a Table's domain).

    Discrete variables are handled per `self.multinomial_treatment`,
    continuous ones per `self.normalize_continuous`. Raises ValueError
    for multinomial data under ReportError, and TypeError when
    distributions are required but only a bare Domain was passed.
    """
    def transform_discrete(var):
        # Return the list of continuous replacements for a discrete `var`
        # (may be empty when the variable is dropped).
        if (len(var.values) < 2 or treat == self.Ignore or
                treat == self.IgnoreMulti and len(var.values) > 2):
            return []
        if treat == self.AsOrdinal:
            # Keep values as 0..k-1 via an identity transform.
            new_var = ContinuousVariable(var.name)
            new_var.get_value_from = Identity(var)
            return [new_var]
        if treat == self.AsNormalizedOrdinal:
            new_var = ContinuousVariable(var.name)
            n_values = max(1, len(var.values))
            if self.zero_based:
                # Map to [0, 1].
                new_var.get_value_from = \
                    Normalizer(var, 0, 1 / (n_values - 1))
            else:
                # Map to [-1, 1].
                new_var.get_value_from = \
                    Normalizer(var, (n_values - 1) / 2, 2 / (n_values - 1))
            return [new_var]
        # Indicator (one-hot-like) encodings: pick a base value to omit.
        new_vars = []
        if treat == self.NValues:
            base = -1  # no base value: indicator for every value
        elif treat == self.LowestIsBase or treat == self.IgnoreMulti:
            base = max(var.base_value, 0)
        else:
            # FrequentIsBase: the modus of the precomputed distribution.
            base = dists[var_ptr].modus()
        # Indicator_1 encodes absent as -1, Indicator as 0.
        IndClass = [Indicator_1, Indicator][self.zero_based]
        for i, val in enumerate(var.values):
            if i == base:
                continue
            new_var = ContinuousVariable(
                "{}={}".format(var.name, val))
            new_var.get_value_from = IndClass(var, i)
            new_vars.append(new_var)
        return new_vars

    def transform_continuous(var):
        # Return `var` untouched, or a normalized replacement.
        if self.normalize_continuous == self.Leave:
            return var
        elif self.normalize_continuous == self.NormalizeBySpan:
            new_var = ContinuousVariable(var.name)
            dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
            diff = dma - dmi
            # Guard against a zero (constant-column) span.
            if diff < 1e-15:
                diff = 1
            if self.zero_based:
                # Map to [0, 1].
                new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
            else:
                # Map to [-1, 1].
                new_var.get_value_from = Normalizer(var, (dma + dmi) / 2,
                                                    2 / diff)
            return new_var
        elif self.normalize_continuous == self.NormalizeBySD:
            # Standardize to zero mean, unit standard deviation.
            # NOTE(review): no guard for sd == 0 — a constant column
            # would divide by zero here.
            new_var = ContinuousVariable(var.name)
            avg = dists[var_ptr].mean()
            sd = dists[var_ptr].standard_deviation()
            new_var.get_value_from = Normalizer(var, avg, 1 / sd)
            return new_var

    def transform_list(s):
        # Transform a sequence of variables, advancing `var_ptr` so it
        # stays aligned with the `dists` list when distributions are used.
        nonlocal var_ptr
        new_vars = []
        for var in s:
            if isinstance(var, DiscreteVariable):
                new_vars += transform_discrete(var)
                if needs_discrete:
                    var_ptr += 1
            else:
                new_var = transform_continuous(var)
                if new_var is not None:
                    new_vars.append(new_var)
                if needs_continuous:
                    var_ptr += 1
        return new_vars

    treat = self.multinomial_treatment
    transform_class = self.transform_class
    domain = data if isinstance(data, Domain) else data.domain
    if treat == self.ReportError and any(
            isinstance(var, DiscreteVariable) and len(var.values) > 2
            for var in domain):
        raise ValueError("data has multinomial attributes")
    # Distributions are needed for FrequentIsBase and any normalization.
    needs_discrete = (treat == self.FrequentIsBase and
                      domain.has_discrete_attributes(transform_class))
    needs_continuous = (not self.normalize_continuous == self.Leave and
                        domain.has_continuous_attributes(transform_class))
    if needs_discrete or needs_continuous:
        if isinstance(data, Domain):
            # A bare Domain carries no data to compute distributions from.
            raise TypeError("continuizer requires data")
        dists = distribution.get_distributions(
            data, not needs_discrete, not needs_continuous)
    var_ptr = 0
    new_attrs = transform_list(domain.attributes)
    if transform_class:
        new_classes = transform_list(domain.class_vars)
    else:
        new_classes = domain.class_vars
    return Domain(new_attrs, new_classes, domain.metas)
def test_sparse_get_distributions(self):
    """Distributions over a sparse table built from a dense matrix."""

    def assert_dist_and_unknowns(computed, goal_dist):
        # Check values and the unknown count: unknowns equal the total
        # (weighted) row count minus the summed frequencies.
        nonlocal d
        goal_dist = np.array(goal_dist)
        sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim == 2
                          else goal_dist)
        n_all = np.sum(d.W) if d.has_weights() else len(d)
        assert_dist_almost_equal(computed, goal_dist)
        self.assertEqual(computed.unknowns, n_all - sum_dist)

    # 10 discrete ("d0".."d9", values a/b/c) + 10 continuous ("c0".."c9").
    domain = data.Domain([
        data.DiscreteVariable("d%i" % i, values=tuple("abc"))
        for i in range(10)
    ] + [data.ContinuousVariable("c%i" % i) for i in range(10)])
    # pylint: disable=bad-whitespace
    X = sp.csr_matrix(
        # 0  1  2  3  4       5       6       7  8  9 10 11 12 13   14 15 16 17      18 19
        # --------------------------------------------------------------------------------
        [[0, 2, 0, 2, 1,      1,      2,      0, 0, 1, 0, 0, 0, 1,   1, 0, 2, np.nan, 2, 0],
         [0, 0, 1, 1, np.nan, np.nan, 1,      0, 2, 0, 0, 0, 0, 0,   2, 0, 1, np.nan, 0, 0],
         [0, 0, 0, 1, 0,      2,      np.nan, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0],
         [0, 0, 0, 0, 0,      0,      0,      0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0],
         [0, 0, 2, 0, 0,      0,      1,      0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0,      0, 0]])
    # Assigning into a CSR matrix triggers SparseEfficiencyWarning;
    # silence it since this is a one-off fixture tweak.
    warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning)
    # Store an explicit zero so explicitly-stored zeros are exercised too.
    X[0, 0] = 0
    d = data.Table.from_numpy(domain, X)
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    # Discrete columns: counts per value a/b/c, zeros included.
    zeros = [5, 0, 0]
    assert_dist_and_unknowns(ddist[0], zeros)
    assert_dist_and_unknowns(ddist[1], [4, 0, 1])
    assert_dist_and_unknowns(ddist[2], [3, 1, 1])
    assert_dist_and_unknowns(ddist[3], [2, 2, 1])
    assert_dist_and_unknowns(ddist[4], [3, 1, 0])
    assert_dist_and_unknowns(ddist[5], [2, 1, 1])
    assert_dist_and_unknowns(ddist[6], [1, 2, 1])
    assert_dist_and_unknowns(ddist[7], zeros)
    assert_dist_and_unknowns(ddist[8], [4, 0, 1])
    assert_dist_and_unknowns(ddist[9], [4, 1, 0])
    # Continuous columns: 2 x k arrays of (value, frequency).
    zeros = [[0], [5]]
    assert_dist_and_unknowns(ddist[10], zeros)
    assert_dist_and_unknowns(ddist[11], zeros)
    assert_dist_and_unknowns(ddist[12], zeros)
    assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[15], zeros)
    assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [3]])
    assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]])
    assert_dist_and_unknowns(ddist[19], zeros)
    # Repeat with row weights 1..5: frequencies become weighted sums.
    # `unlocked` is needed because tables are read-only by default.
    with d.unlocked():
        d.set_weights(np.array([1, 2, 3, 4, 5]))
    ddist = distribution.get_distributions(d)
    self.assertEqual(len(ddist), 20)
    assert_dist_and_unknowns(ddist[0], [15, 0, 0])
    assert_dist_and_unknowns(ddist[1], [14, 0, 1])
    assert_dist_and_unknowns(ddist[2], [8, 2, 5])
    assert_dist_and_unknowns(ddist[3], [9, 5, 1])
    assert_dist_and_unknowns(ddist[4], [12, 1, 0])
    assert_dist_and_unknowns(ddist[5], [9, 1, 3])
    assert_dist_and_unknowns(ddist[6], [4, 7, 1])
    assert_dist_and_unknowns(ddist[7], [15, 0, 0])
    assert_dist_and_unknowns(ddist[8], [13, 0, 2])
    assert_dist_and_unknowns(ddist[9], [14, 1, 0])
    zeros = [[0], [15]]
    assert_dist_and_unknowns(ddist[10], zeros)
    assert_dist_and_unknowns(ddist[11], zeros)
    assert_dist_and_unknowns(ddist[12], zeros)
    assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]])
    assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]])
    assert_dist_and_unknowns(ddist[15], zeros)
    assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]])
    assert_dist_and_unknowns(ddist[17], [[0], [12]])
    assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]])
    assert_dist_and_unknowns(ddist[19], zeros)
def __call__(self, data):
    """Continuize a Domain (or a Table's domain).

    Discrete variables are handled per `self.multinomial_treatment`,
    continuous ones per `self.normalize_continuous`. Raises ValueError
    for multinomial data under ReportError, and TypeError when
    distributions are required but only a bare Domain was passed.
    """
    def transform_discrete(var):
        # Return the list of continuous replacements for a discrete `var`
        # (may be empty when the variable is dropped).
        if (len(var.values) < 2 or treat == self.Ignore or
                treat == self.IgnoreMulti and len(var.values) > 2):
            return []
        if treat == self.AsOrdinal:
            # Keep values as 0..k-1 via an identity transform.
            new_var = ContinuousVariable(var.name)
            new_var.get_value_from = Identity(var)
            return [new_var]
        if treat == self.AsNormalizedOrdinal:
            new_var = ContinuousVariable(var.name)
            n_values = max(1, len(var.values))
            if self.zero_based:
                # Map to [0, 1].
                new_var.get_value_from = \
                    Normalizer(var, 0, 1 / (n_values - 1))
            else:
                # Map to [-1, 1].
                new_var.get_value_from = \
                    Normalizer(var, (n_values - 1) / 2, 2 / (n_values - 1))
            return [new_var]
        # Indicator (one-hot-like) encodings: pick a base value to omit.
        new_vars = []
        if treat == self.NValues:
            base = -1  # no base value: indicator for every value
        elif treat == self.LowestIsBase or treat == self.IgnoreMulti:
            base = max(var.base_value, 0)
        else:
            # FrequentIsBase: the modus of the precomputed distribution.
            base = dists[var_ptr].modus()
        # Indicator_1 encodes absent as -1, Indicator as 0.
        IndClass = [Indicator_1, Indicator][self.zero_based]
        for i, val in enumerate(var.values):
            if i == base:
                continue
            new_var = ContinuousVariable("{}={}".format(var.name, val))
            new_var.get_value_from = IndClass(var, i)
            new_vars.append(new_var)
        return new_vars

    def transform_continuous(var):
        # Return `var` untouched, or a normalized replacement.
        if self.normalize_continuous == self.Leave:
            return var
        elif self.normalize_continuous == self.NormalizeBySpan:
            new_var = ContinuousVariable(var.name)
            dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
            diff = dma - dmi
            # Guard against a zero (constant-column) span.
            if diff < 1e-15:
                diff = 1
            if self.zero_based:
                # Map to [0, 1].
                new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
            else:
                # Map to [-1, 1].
                new_var.get_value_from = Normalizer(
                    var, (dma + dmi) / 2, 2 / diff)
            return new_var
        elif self.normalize_continuous == self.NormalizeBySD:
            # Standardize to zero mean, unit standard deviation.
            # NOTE(review): no guard for sd == 0 — a constant column
            # would divide by zero here.
            new_var = ContinuousVariable(var.name)
            avg = dists[var_ptr].mean()
            sd = dists[var_ptr].standard_deviation()
            new_var.get_value_from = Normalizer(var, avg, 1 / sd)
            return new_var

    def transform_list(s):
        # Transform a sequence of variables, advancing `var_ptr` so it
        # stays aligned with the `dists` list when distributions are used.
        nonlocal var_ptr
        new_vars = []
        for var in s:
            if isinstance(var, DiscreteVariable):
                new_vars += transform_discrete(var)
                if needs_discrete:
                    var_ptr += 1
            else:
                new_var = transform_continuous(var)
                if new_var is not None:
                    new_vars.append(new_var)
                if needs_continuous:
                    var_ptr += 1
        return new_vars

    treat = self.multinomial_treatment
    transform_class = self.transform_class
    domain = data if isinstance(data, Domain) else data.domain
    if treat == self.ReportError and any(
            isinstance(var, DiscreteVariable) and len(var.values) > 2
            for var in domain):
        raise ValueError("data has multinomial attributes")
    # Distributions are needed for FrequentIsBase and any normalization.
    needs_discrete = (treat == self.FrequentIsBase and
                      domain.has_discrete_attributes(transform_class))
    needs_continuous = (not self.normalize_continuous == self.Leave and
                        domain.has_continuous_attributes(transform_class))
    if needs_discrete or needs_continuous:
        if isinstance(data, Domain):
            # A bare Domain carries no data to compute distributions from.
            raise TypeError("continuizer requires data")
        dists = distribution.get_distributions(data, not needs_discrete,
                                               not needs_continuous)
    var_ptr = 0
    new_attrs = transform_list(domain.attributes)
    if transform_class:
        new_classes = transform_list(domain.class_vars)
    else:
        new_classes = domain.class_vars
    return Domain(new_attrs, new_classes, domain.metas)