def column_imputer_as_value(variable, table):
    """
    Build a ``ColumnImputerAsValue`` that encodes missing values explicitly.

    * Discrete ``variable``: a new discrete variable with the same name and
      one extra value, ``"N/A"``, appended to the original values. Its
      ``compute_value`` is a ``Lookup`` over an identity table whose
      ``unknown`` is the index of the appended value.
    * Continuous ``variable``: the original variable is kept and a new
      discrete indicator ``<name>_def`` with values ``("undef", "def")`` is
      added, computed by ``IsDefined``. The original column is passed through
      ``ReplaceUnknowns`` with the mean taken from ``BasicStats``.

    Raises ``TypeError`` (carrying the variable's type) for any other kind
    of variable.
    """
    if variable.is_discrete:
        n_values = len(variable.values)
        # Identity table; unknowns are sent to the index of the new value.
        identity_lookup = Lookup(
            variable,
            numpy.arange(n_values, dtype=int),
            unknown=n_values,
        )
        na_var = Orange.data.DiscreteVariable(
            "{var.name}".format(var=variable),
            values=variable.values + ["N/A"],
            base_value=variable.base_value,
            compute_value=identity_lookup,
        )
        codomain = [na_var]
        transformers = [na_var.compute_value]
    elif variable.is_continuous:
        indicator = Orange.data.DiscreteVariable(
            "{var.name}_def".format(var=variable),
            values=("undef", "def"),
            compute_value=IsDefined(variable),
        )
        codomain = [variable, indicator]
        column_stats = basic_stats.BasicStats(table, variable)
        transformers = [
            ReplaceUnknowns(variable, column_stats.mean),
            indicator.compute_value,
        ]
    else:
        raise TypeError(type(variable))

    target_domain = Orange.data.Domain(codomain)
    return ColumnImputerAsValue(table.domain, target_domain, transformers)
def remove_unused_values(var, data):
    """
    Return a variant of discrete ``var`` without values unused in ``data``.

    The column for ``var`` is extracted from ``data``; the finite entries
    determine which value indices are in use. If the number of used indices
    equals the number of values, ``var`` itself is returned. Otherwise a new
    ``DiscreteVariable`` with the same name is returned whose
    ``compute_value`` is a ``Lookup`` mapping old indices to new ones
    (unused old indices map to NaN).

    The base value is carried over when it refers to a value that is still
    present; otherwise it is reset to -1.
    """
    column_data = Table.from_table(Domain([var]), data)
    array = column_data.X.ravel()
    mask = np.isfinite(array)
    unique = np.array(np.unique(array[mask]), dtype=int)

    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed).
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = range(len(used_values))

    base_value = -1
    # Only a valid index may be translated. The previous ``0 >= ...`` check
    # let -1 through (indexing the last entry) and rejected every positive
    # base value.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        if np.isfinite(base):
            base_value = int(base)

    return DiscreteVariable(
        "{}".format(var.name),
        values=used_values,
        base_value=base_value,
        compute_value=Lookup(var, translation_table),
    )
def test_transform(self):
    """Lookup.transform maps indices through the table for dense and sparse input."""
    table = np.array([1, 2, 0, 2])
    lookup = Lookup(None, table)

    dense = np.array([1, 2, 3, 0, np.nan, 0], dtype=np.float64)
    sparse = sp.csr_matrix(dense)
    # NaN in the input is expected to stay NaN in the output.
    expected = np.array([2, 0, 2, 1, np.nan, 1], dtype=np.float64)

    for col in (dense, sparse):
        transformed = lookup.transform(col)
        np.testing.assert_array_equal(transformed, expected)
def test_discrete_reorder(self):
    """An identity CategoriesMapping listed in a new order reorders the values."""
    D = DiscreteVariable("D", values=("2", "3", "1", "0"))
    # Each category keeps its name; only the order of the entries differs
    # from the order of D.values.
    reorder = CategoriesMapping(
        (("0", "0"), ("1", "1"), ("2", "2"), ("3", "3"))
    )
    DD = apply_transform_var(D, [reorder])

    self.assertSequenceEqual(DD.values, ["0", "1", "2", "3"])
    expected_lookup = Lookup(D, np.array([2, 3, 1, 0]))
    self._assertLookupEquals(DD.compute_value, expected_lookup)
def merge_lookup(A, B):
    """
    Merge two consecutive Lookup transforms into one.

    Every finite entry of ``A.lookup_table`` is used as an index into
    ``B.lookup_table`` and replaced by the value found there; non-finite
    entries of A's table are left untouched. The result is a new ``Lookup``
    on ``A.variable``; neither input table is modified.
    """
    a_table = np.asarray(A.lookup_table)
    b_table = np.asarray(B.lookup_table)
    # Work on a copy in the common dtype of both tables: an integer table
    # from A could not otherwise hold NaN entries coming from B.
    lookup_table = a_table.astype(np.result_type(a_table, b_table))
    mask = np.isfinite(lookup_table)
    indices = lookup_table[mask].astype(int)
    lookup_table[mask] = b_table[indices]
    return Lookup(A.variable, lookup_table)
def test_hash_nan(self):
    """
    The hash of a lookup must be stable across repeated calls.

    Regression test for a numpy bug (seen on PY3.10) that showed up when
    the lookup table contained nan:
    https://github.com/numpy/numpy/issues/21210
    """
    table_with_nan = np.array([1, 2, np.nan, 2])
    lookup = Lookup(None, table_with_nan)

    reference = hash(lookup)
    repeated = [hash(lookup) for _ in range(9)]
    self.assertTrue(all(value == reference for value in repeated))
def apply_transform_discete(var, trs):
    # type: (Orange.data.DiscreteVariable, ...) -> ...
    # pylint: disable=too-many-branches
    """
    Apply a sequence of edit transforms to a discrete variable.

    Recognised transforms are ``Rename`` (new name), ``CategoriesMapping``
    (rename / merge / drop / add categories), ``Annotate`` (attributes) and
    ``ChangeOrdered`` (ordered flag). When several transforms of the same
    kind are given, the last one wins. Other transform types are ignored.

    A ``CategoriesMapping`` is a sequence of ``(source, destination)`` pairs:

    * ``destination is None`` drops the source category (it maps to NaN);
    * ``source is None`` adds a new destination category;
    * several sources sharing one destination are merged into it.

    Returns a new ``Orange.data.DiscreteVariable`` whose ``compute_value`` is
    a ``Lookup`` on ``var`` when a mapping was supplied and ``Identity(var)``
    otherwise. The base value is remapped through the lookup and reset to -1
    if its category was dropped.
    """
    name, annotations = var.name, var.attributes
    base_value = var.base_value
    mapping = None
    ordered = var.ordered
    for tr in trs:
        if isinstance(tr, Rename):
            name = tr.name
        elif isinstance(tr, CategoriesMapping):
            mapping = tr.mapping
        elif isinstance(tr, Annotate):
            annotations = _parse_attributes(tr.annotations)
        elif isinstance(tr, ChangeOrdered):
            ordered = tr.ordered

    source_values = var.values
    if mapping is not None:
        # Keep the first occurrence of every destination so that merging
        # several source categories into one yields a single value instead
        # of tripping the uniqueness assertion below.
        dest_values = []
        seen = set()
        for _, cj in mapping:
            if cj is not None and cj not in seen:
                seen.add(cj)
                dest_values.append(cj)
    else:
        dest_values = var.values

    def positions(values):
        rval = {c: i for i, c in enumerate(values)}
        assert len(rval) == len(values)
        return rval

    source_codes = positions(source_values)
    dest_codes = positions(dest_values)
    if mapping is not None:
        # construct a lookup table (``np.float`` was removed from NumPy;
        # the builtin ``float`` is the same dtype)
        lookup = np.full(len(source_values), np.nan, dtype=float)
        for ci, cj in mapping:
            if ci is not None and cj is not None:
                i, j = source_codes[ci], dest_codes[cj]
                lookup[i] = j
        if base_value != -1:
            remapped = lookup[base_value]
            # A dropped base category has no counterpart; otherwise store a
            # plain int index rather than a NumPy float.
            base_value = -1 if np.isnan(remapped) else int(remapped)
        lookup = Lookup(var, lookup)
    else:
        lookup = Identity(var)

    variable = Orange.data.DiscreteVariable(
        name, values=dest_values, base_value=base_value,
        compute_value=lookup, ordered=ordered,
    )
    variable.attributes.update(annotations)
    return variable
def remove_unused_values(var, data):
    """
    Return a variant of discrete ``var`` without values unused in ``data``.

    The used value indices are the non-NaN unique entries of the column
    view of ``var`` in ``data``. If their count equals the number of values,
    ``var`` itself is returned. Otherwise a new ``DiscreteVariable`` with the
    same name and ``sparse`` flag is returned, with a ``Lookup`` on ``var``
    that maps old indices to new ones (unused old indices map to NaN).
    """
    column = data.get_column_view(var)[0].astype(float)
    unique = nanunique(column).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed).
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = range(len(used_values))
    return DiscreteVariable(
        var.name,
        values=used_values,
        sparse=var.sparse,
        compute_value=Lookup(var, translation_table),
    )
def sort_var_values(var):
    """
    Return a variant of discrete ``var`` whose values are in sorted order.

    If the values are already sorted, ``var`` itself is returned. Otherwise
    a new ``DiscreteVariable`` with the same name is created; its
    ``compute_value`` is a ``Lookup`` on ``var`` that maps each old value
    index to the position of that value in the sorted list.
    """
    ordered_values = sorted(var.values)
    if ordered_values == list(var.values):
        return var

    new_positions = [
        float(ordered_values.index(old_value)) for old_value in var.values
    ]
    translation_table = np.array(new_positions)
    remap = Lookup(var, translation_table)
    return DiscreteVariable(var.name, values=ordered_values, compute_value=remap)
def test_discrete_merge(self):
    """Source categories mapped to the same destination are merged."""
    D = DiscreteVariable("D", values=("2", "3", "1", "0"))
    # "0" and "2" collapse into "x"; "1" and "3" collapse into "y".
    merge = CategoriesMapping((
        ("0", "x"),
        ("1", "y"),
        ("2", "x"),
        ("3", "y"),
    ))
    DD = apply_transform_var(D, [merge])

    self.assertSequenceEqual(DD.values, ["x", "y"])
    expected_lookup = Lookup(D, np.array([0, 1, 1, 0]))
    self._assertLookupEquals(DD.compute_value, expected_lookup)
def test_discrete_add_drop(self):
    """Categories can be dropped (mapped to None) and added (mapped from None)."""
    D = DiscreteVariable("D", values=("2", "3", "1", "0"))
    # "0" and "3" are dropped, "1" and "2" are kept, "A" is newly added.
    add_drop = CategoriesMapping((
        ("0", None),
        ("1", "1"),
        ("2", "2"),
        ("3", None),
        (None, "A"),
    ))
    DD = apply_transform_var(D, [add_drop])

    self.assertSequenceEqual(DD.values, ["1", "2", "A"])
    # Dropped source categories are expected to map to NaN.
    expected_lookup = Lookup(D, np.array([1, np.nan, 0, np.nan]))
    self._assertLookupEquals(DD.compute_value, expected_lookup)
def remove_unused_values(var, data):
    """
    Return a variant of discrete ``var`` without values unused in ``data``.

    ``data`` is projected onto a single-variable domain and the non-NaN
    unique entries of the resulting ``X`` give the used value indices. If
    their count equals the number of values, ``var`` itself is returned.
    Otherwise a new ``DiscreteVariable`` with the same name and ``sparse``
    flag is returned, with a ``Lookup`` on ``var`` that maps old indices to
    new ones (unused old indices map to NaN).
    """
    column_data = Table.from_table(Domain([var]), data)
    unique = nanunique(column_data.X).astype(int)
    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed).
    translation_table = np.full(len(var.values), np.nan)
    translation_table[unique] = range(len(used_values))
    return DiscreteVariable(
        "{}".format(var.name),
        values=used_values,
        compute_value=Lookup(var, translation_table),
        sparse=var.sparse,
    )
def remove_unused_values(var, data):
    """
    Return a variant of discrete ``var`` without values unused in ``data``.

    The column for ``var`` is extracted from ``data``; the finite entries
    determine which value indices are in use. If the number of used indices
    equals the number of values, ``var`` itself is returned. Otherwise a new
    ``Orange.data.DiscreteVariable`` named ``R_<name>`` is returned whose
    ``compute_value`` is a ``Lookup`` mapping old indices to new ones
    (unused old indices map to NaN).

    The base value is carried over when it refers to a value that is still
    present; otherwise the new variable keeps its default base value.
    """
    column_data = Orange.data.Table.from_table(
        Orange.data.Domain([var]), data
    )
    array = column_data.X.ravel()
    mask = numpy.isfinite(array)
    unique = numpy.array(numpy.unique(array[mask]), dtype=int)

    if len(unique) == len(var.values):
        return var

    used_values = [var.values[i] for i in unique]
    new_var = Orange.data.DiscreteVariable(
        "R_{}".format(var.name), values=used_values
    )
    # numpy.nan (not the numpy.NaN alias, which NumPy 2.0 removed).
    translation_table = numpy.full(len(var.values), numpy.nan)
    translation_table[unique] = range(len(new_var.values))

    # Only a valid index may be translated. The previous ``0 >= ...`` check
    # let -1 through (indexing the last entry) and rejected every positive
    # base value.
    if 0 <= var.base_value < len(var.values):
        base = translation_table[var.base_value]
        if numpy.isfinite(base):
            new_var.base_value = int(base)

    new_var.compute_value = Lookup(var, translation_table)
    return new_var
def test_lookup(self):
    """Equality and hash of Lookup depend on variable, table and unknown."""
    # Same table and unknown: disc1/disc1a compare equal, disc2 does not.
    on_disc1 = Lookup(self.disc1, np.array([0, 2, 1]), 1)
    on_disc1a = Lookup(self.disc1a, np.array([0, 2, 1]), 1)
    on_disc2 = Lookup(self.disc2, np.array([0, 2, 1]), 1)
    self.assertEqual(on_disc1, on_disc1)
    self.assertEqual(on_disc1, on_disc1a)
    self.assertNotEqual(on_disc1, on_disc2)
    self.assertEqual(hash(on_disc1), hash(on_disc1a))
    self.assertNotEqual(hash(on_disc1), hash(on_disc2))

    # A different lookup table breaks equality.
    reference = Lookup(self.disc1, np.array([0, 2, 1]), 1)
    other_table = Lookup(self.disc1a, np.array([1, 2, 0]), 1)
    self.assertNotEqual(reference, other_table)
    self.assertNotEqual(hash(reference), hash(other_table))

    # A different ``unknown`` breaks equality.
    reference = Lookup(self.disc1, np.array([0, 2, 1]), 1)
    other_unknown = Lookup(self.disc1a, np.array([0, 2, 1]), 2)
    self.assertNotEqual(reference, other_unknown)
    self.assertNotEqual(hash(reference), hash(other_unknown))
def test_transform(self):
    """Lookup.transform maps each index through the lookup table."""
    table = np.array([1, 2, 0, 2])
    lookup = Lookup(None, table)

    column = np.array([1, 2, 3, 0, np.nan, 0], dtype=np.float64)
    # NaN in the input is expected to stay NaN in the output.
    expected = np.array([2, 0, 2, 1, np.nan, 1], dtype=np.float64)

    transformed = lookup.transform(column)
    np.testing.assert_array_equal(transformed, expected)