def commit(self):
    """Send the painted points to the output as a table.

    Each row of ``self.data`` is read as coordinates followed by a label
    in column index 2.  A class variable is added only when at least two
    distinct labels are present.  Proposed variable names are
    de-duplicated; when anything had to be renamed, a warning is shown
    and the plot axes are relabelled with the unique names.
    """
    self.Warning.renamed_vars.clear()

    if not self.data:
        self.Outputs.data.send(None)
        return

    points = np.array(self.data)
    labels = points[:, 2]
    if self.hasAttr2:
        coords = points[:, :2]
        proposed = [self.attr1.strip(), self.attr2.strip()]
    else:
        # Keep the single coordinate as a 2d column
        coords = points[:, np.newaxis, 0]
        proposed = [self.attr1.strip()]

    with_class = len(np.unique(labels)) >= 2
    if with_class:
        proposed.append("Class")
    unique_names, renamed = get_unique_names_duplicates(proposed, True)

    if with_class:
        # The last unique name belongs to the class variable
        features = map(ContinuousVariable, unique_names[:-1])
        class_var = DiscreteVariable(
            unique_names[-1], values=tuple(self.class_model))
        table = Table.from_numpy(Domain(features, class_var), coords, labels)
    else:
        features = map(ContinuousVariable, unique_names)
        table = Table.from_numpy(Domain(features), coords)

    if renamed:
        self.Warning.renamed_vars(", ".join(renamed))
        self.plot.getAxis("bottom").setLabel(unique_names[0])
        self.plot.getAxis("left").setLabel(unique_names[1])

    table.name = self.table_name
    self.Outputs.data.send(table)
def test_get_unique_names_from_duplicates(self):
    """Unique names pass through; repeated names get numbered suffixes."""
    cases = (
        (["foo", "bar", "baz"],
         ["foo", "bar", "baz"]),
        (["foo", "bar", "baz", "bar"],
         ["foo", "bar (1)", "baz", "bar (2)"]),
    )
    for proposed, expected in cases:
        self.assertEqual(get_unique_names_duplicates(proposed), expected)
def merge_data(self, domain, values, ids=None):
    """Build a table from per-role variable lists and column blocks.

    Parameters
    ----------
    domain : dict
        Maps ``Domain`` keyword names to lists of variables; it is
        expanded as ``Domain(**domain)``.  Variables whose names collide
        within one list are replaced *in place* by renamed copies.
    values : dict
        May contain the keys ``'attributes'``, ``'metas'`` and
        ``'class_vars'``, each holding a sequence of arrays that are
        stacked horizontally.
    ids : optional
        If not None, assigned to the ``ids`` of the resulting table.

    Returns
    -------
    Table
        The table built with ``Table.from_numpy``.

    Notes
    -----
    A warning listing the original names of renamed variables is shown
    when any renaming took place; each name is listed once, in the order
    it was first encountered.
    """
    renamed = []
    for variables in domain.values():
        names = [var.name for var in variables]
        unique_names = get_unique_names_duplicates(names)
        for idx, (var, old, new) in enumerate(
                zip(variables, names, unique_names)):
            if old == new:
                continue
            variables[idx] = var.copy(name=new)
            # A name that collides several times is reported only once
            if old not in renamed:
                renamed.append(old)
    if renamed:
        self.Warning.renamed_vars(', '.join(renamed))

    X = metas = class_vars = None
    # Row count for the empty X placeholder; stays 0 when no block at
    # all is given, instead of failing on an unbound local
    n_rows = 0
    if 'attributes' in values:
        X = np.hstack(values['attributes'])
    if 'metas' in values:
        metas = np.hstack(values['metas'])
        n_rows = len(metas)
    if 'class_vars' in values:
        class_vars = np.hstack(values['class_vars'])
        n_rows = len(class_vars)
    if X is None:
        # No attributes: give the table an X with zero columns
        X = np.empty((n_rows, 0))

    table = Table.from_numpy(Domain(**domain), X, class_vars, metas)
    if ids is not None:
        table.ids = ids
    return table
def create_scores_table(self, labels):
    """Return a table of feature scores, or None if there are no scores.

    ``labels`` (a tuple of score names) is extended with ``'Feature'``
    and de-duplicated; the last unique name becomes the string meta
    holding feature names, the others become continuous score columns.
    A warning lists any names that had to be renamed.
    """
    self.Warning.renamed_variables.clear()

    rows = self.ranksModel.tolist()
    has_scores = bool(rows) and len(rows[0]) != 1
    if not has_scores:
        # Empty or just n_values column
        return None

    names, renamed = get_unique_names_duplicates(
        labels + ('Feature', ), return_duplicated=True)
    if renamed:
        self.Warning.renamed_variables(', '.join(renamed))

    *score_names, meta_name = names
    score_vars = [ContinuousVariable(name) for name in score_names]
    domain = Domain(score_vars, metas=[StringVariable(meta_name)])

    # Prevent np.inf scores by clipping to the finite float64 range;
    # the first model column is dropped
    limits = np.finfo(np.float64)
    scores = np.clip(np.array(rows)[:, 1:], limits.min, limits.max)

    # Table does not like 1d arrays, so the names become a 2d column
    attributes = self.data.domain.attributes
    feature_names = np.array([attr.name for attr in attributes])[:, None]

    table = Table(domain, scores, metas=feature_names)
    table.name = "Feature Scores"
    return table
def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
    """Construct the four domains describing the pivot table.

    Parameters
    ----------
    val_var
        Variable being aggregated; when it is falsy or continuous the
        table columns are continuous, otherwise discrete.
    X, X_h, X_v, X_t
        Arrays backing the four domains, in the order the domains are
        returned.  In the discrete case their columns are recoded
        *in place* to value indices.
        NOTE(review): presumably the table body, horizontal totals,
        vertical totals and grand total -- confirm against the caller.
    agg_funs
        Aggregation functions; their ``str()`` forms become the values
        of the 'Aggregate' variable.

    Returns
    -------
    tuple of four Domain objects

    Side effects: names that had to be de-duplicated are appended to
    ``self.renamed``; ``self._row_var`` (and ``self._col_var`` when it is
    the same object) may be replaced by a renamed copy.
    """
    def map_values(index, _X):
        # Distinct entries of the column, without the "nan" marker
        values = np.unique(_X[:, index])
        values = np.delete(values, np.where(values == "nan")[0])
        # Overwrite each entry of the column, in place, with the
        # position of its value in `values`
        for j, value in enumerate(values):
            _X[:, index][_X[:, index] == value] = j
        return values

    # Column-variable value names selected by the group indices
    vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
    if not val_var or val_var.is_continuous:
        cv = ContinuousVariable
        # `[...] * 2` repeats the *same* inner list, so attrs[0] is
        # attrs[1] and attrs[2] is attrs[3] in this branch.
        # NOTE(review): second argument 1 is presumably the number of
        # decimals -- confirm.
        attrs = [[cv(f"{v}", 1) for v in vals]] * 2
        attrs.extend([[cv("Total", 1)]] * 2)
    else:
        attrs = []
        for x in (X, X_h):
            # Column offset 2 matches the two leading variables (row
            # variable, aggregate) that precede these attributes in
            # the returned domains
            attrs.append([
                DiscreteVariable(f"{v}", map_values(i, x))
                for i, v in enumerate(vals, 2)
            ])
        for x in (X_v, X_t):
            attrs.append([DiscreteVariable("Total", map_values(0, x))])
    row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
    aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

    # Checked before any renaming replaces self._row_var below
    same_row_col = self._col_var is self._row_var

    extra_vars = [self._row_var, aggr_attr]
    uniq_a = get_unique_names_duplicates([v.name for v in extra_vars] +
                                         [atr.name for atr in attrs[0]])
    # idx 0 is the row variable, idx 1 the aggregate variable, and
    # idx >= 2 maps to attrs[0][idx - 2]
    for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                             uniq_a):
        if var.name == u:
            continue
        if idx == 0:
            self.renamed.append(self._row_var.name)
            self._row_var = self._row_var.copy(name=u)
            if same_row_col:
                self._col_var = self._row_var
            row_var_h = row_var_h.copy(name=u)
        elif idx == 1:
            self.renamed.append(aggr_attr.name)
            aggr_attr = aggr_attr.copy(name=u)
        else:
            self.renamed.append(var.name)
            # Both lists receive a renamed copy of the attrs[0] variable
            attrs[0][idx - 2] = var.copy(name=u)
            attrs[1][idx - 2] = var.copy(name=u)

    if same_row_col:
        # The results of both make() calls are discarded.
        # NOTE(review): presumably relied on for a side effect of
        # make() -- confirm.
        vals = tuple(v.name for v in attrs[0])
        self._row_var.make(self._row_var.name, values=vals)
        vals = tuple(v.name for v in attrs[2])
        row_var_h.make(row_var_h.name, vals)

    return (Domain([self._row_var, aggr_attr] + attrs[0]),
            Domain([row_var_h, aggr_attr] + attrs[1]),
            Domain(attrs[2]), Domain(attrs[3]))
def __init__(self, headers: List):
    """
    Store header data, with names made unique.

    Parameters
    ----------
    headers: List
        Header rows, to be used for constructing domain.
    """
    raw_names, var_types, var_flags = self.create_header_data(headers)
    # Repeated names are replaced by unique ones before being stored
    self.names = get_unique_names_duplicates(raw_names)
    self.types, self.flags = var_types, var_flags
def _merge_domains(domains):
    """Combine `domains` into one domain with unique variable names.

    The attributes, class variables and metas are each collected with
    ``_get_part`` using ``set.union``.  Names are then de-duplicated
    across all three parts together, and every variable whose name
    changed is replaced by a renamed copy.
    """
    roles = ("attributes", "class_vars", "metas")
    parts = [_get_part(domains, set.union, role) for role in roles]

    # One shared stream of unique names, consumed in the same order in
    # which the variables were listed: part by part, variable by variable
    unique_names = iter(get_unique_names_duplicates(
        [var.name for var in chain.from_iterable(parts)]))

    for part in parts:
        for position, var in enumerate(part):
            new_name = next(unique_names)
            if var.name != new_name:
                part[position] = var.renamed(new_name)

    return Domain(*parts)
def merge_domains(self, domains):
    """Merge `domains` into one domain with unique variable names.

    Each part (attributes, class variables, metas) is collected with
    ``self._get_part``, using set union when ``self.merge_type`` is
    ``OWConcatenate.MergeUnion`` and set intersection otherwise.  Names
    are de-duplicated across all parts together; each variable whose
    name changed is replaced by a renamed copy and the renamed-variables
    warning is shown.
    """
    if self.merge_type == OWConcatenate.MergeUnion:
        oper = set.union
    else:
        oper = set.intersection

    roles = ("attributes", "class_vars", "metas")
    parts = [self._get_part(domains, oper, role) for role in roles]

    # One shared stream of unique names, consumed in the same order in
    # which the variables were listed: part by part, variable by variable
    unique_names = iter(get_unique_names_duplicates(
        [var.name for var in chain.from_iterable(parts)]))

    for part in parts:
        for position, var in enumerate(part):
            new_name = next(unique_names)
            if var.name != new_name:
                part[position] = var.renamed(new_name)
                self.Warning.renamed_variables()

    return Orange.data.Domain(*parts)
def _domain_rename_duplicates(self, attributes, class_vars, metas):
    """Build a domain in which all variable names are unique.

    Names are checked across attributes, class variables and metas
    together.  A variable whose name collides is replaced by a copy
    with a de-duplicated name; other variables are used as they are.

    Parameters
    ----------
    attributes, class_vars, metas : sequences of variables
        The three parts of the domain.  They may be lists or tuples and
        need not be of the same sequence type.

    Returns
    -------
    Orange.data.Domain
        Domain built from the (possibly renamed) variables.

    Notes
    -----
    When anything is renamed, a warning lists the original names, each
    once, in the order they first appear.  A list rather than a set is
    used so that the warning text is the same on every run.
    """
    attrs, cvars, mets = [], [], []
    # Unpacking instead of `+` so that lists and tuples can be mixed
    all_vars = [*attributes, *class_vars, *metas]
    # For each variable, the output list it belongs to
    targets = ([attrs] * len(attributes)
               + [cvars] * len(class_vars)
               + [mets] * len(metas))

    proposed_names = [var.name for var in all_vars]
    unique_names = get_unique_names_duplicates(proposed_names)

    duplicates = []
    for p_name, u_name, var, target in zip(proposed_names, unique_names,
                                           all_vars, targets):
        if p_name != u_name:
            if p_name not in duplicates:
                duplicates.append(p_name)
            var = var.copy(name=u_name)
        target.append(var)

    if duplicates:
        self.Warning.renamed_vars(", ".join(duplicates))
    return Orange.data.Domain(attrs, cvars, mets)
def test_get_unique_names_from_duplicates(self):
    """Unique names pass through; repeated names get numbered suffixes."""
    cases = (
        (["foo", "bar", "baz"],
         ["foo", "bar", "baz"]),
        (["foo", "bar", "baz", "bar"],
         ["foo", "bar (1)", "baz", "bar (2)"]),
    )
    for proposed, expected in cases:
        self.assertEqual(get_unique_names_duplicates(proposed), expected)
def get_domain(self, domain, data, deduplicate=False):
    """
    Create domain (and dataset) from changes made in the widget.

    Each entry of the model's ``variables`` is matched, in order, with
    the original variables (attributes, then class variables, then
    metas).  Entries placed in ``Place.skip`` are dropped; the others
    are renamed, converted to the requested variable type and put into
    the requested place.

    Args:
        domain (Domain): original domain
        data (Table): original data
        deduplicate (bool): if True, variable names are deduplicated and
           the result contains an additional list with names of renamed
           variables

    Returns:
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        or
        (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    variables = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, col_data):
        # True only for an exactly-ContinuousVariable column whose
        # non-NaN values all equal their integer truncation
        if type(var) == ContinuousVariable:
            data = np.asarray(col_data.data)  # Works for dense and sparse
            data = data[~np.isnan(data)]
            return (data == data.astype(int)).all()
        return False

    # Exit early with original domain if the user didn't actually change anything
    if all((name == orig_var.name and tpe == type(orig_var) and place == orig_plc)
           for (name, tpe, place, _, _), (orig_var, orig_plc) in zip(
               variables,
               chain(((at, Place.feature) for at in domain.attributes),
                     ((cl, Place.class_var) for cl in domain.class_vars),
                     ((mt, Place.meta) for mt in domain.metas)))):
        if deduplicate:
            return domain, [data.X, data.Y, data.metas], []
        else:
            return domain, [data.X, data.Y, data.metas]

    # Only names of variables that are kept take part in deduplication
    relevant_names = [var[0] for var in variables if var[2] != Place.skip]
    if deduplicate:
        renamed_iter = iter(get_unique_names_duplicates(relevant_names))
    else:
        renamed_iter = iter(relevant_names)
    renamed = []
    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(variables,
                chain([(at, Place.feature) for at in domain.attributes],
                      [(cl, Place.class_var) for cl in domain.class_vars],
                      [(mt, Place.meta) for mt in domain.metas])):
        if place == Place.skip:
            continue

        # renamed_iter has one entry per non-skipped variable, so it is
        # advanced only after the skip check
        new_name = next(renamed_iter)
        if new_name != name and name not in renamed:
            renamed.append(name)

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)

        if new_name == orig_var.name and tpe == type(orig_var):
            # Neither name nor type changed: reuse the original variable
            var = orig_var
        elif tpe == type(orig_var):
            # Same type, new name
            var = orig_var.copy(name=new_name)
        elif tpe == DiscreteVariable:
            # Distinct non-missing entries, as strings, become the values
            values = natural_sorted(
                list(
                    str(i) for i in unique(col_data)
                    if not self._is_missing(i)))
            # Computed before col_data is replaced by value indices
            round_numbers = numbers_are_round(orig_var, col_data)
            col_data = [
                np.nan if self._is_missing(x) else values.index(str(x))
                for x in self._iter_vals(col_data)
            ]
            if round_numbers:
                # Show round numbers without a decimal part
                values = [str(int(float(v))) for v in values]
            var = tpe(new_name, values)
            col_data = self._to_column(col_data, is_sparse)
        elif tpe == StringVariable:
            var = tpe.make(new_name)
            if type(orig_var) in [DiscreteVariable, TimeVariable]:
                col_data = [
                    orig_var.repr_val(x) if not np.isnan(x) else ""
                    for x in self._iter_vals(col_data)
                ]
            elif type(orig_var) == ContinuousVariable:
                round_numbers = numbers_are_round(orig_var, col_data)
                col_data = [
                    '' if np.isnan(x) else
                    str(int(x)) if round_numbers else orig_var.repr_val(x)
                    for x in self._iter_vals(col_data)
                ]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and type(
                orig_var) == DiscreteVariable:
            var = tpe.make(new_name)
            if may_be_numeric:
                # Replace value indices by the numbers the value names spell
                col_data = [
                    np.nan if self._is_missing(x) else float(
                        orig_var.values[int(x)])
                    for x in self._iter_vals(col_data)
                ]
            col_data = self._to_column(col_data, is_sparse)
        else:
            # Any other conversion: new variable of the requested type,
            # column data left unchanged
            var = tpe(new_name)
        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    X = self._merge(feats) if feats else np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)

    if deduplicate:
        return domain, [X, Y, m], renamed
    else:
        return domain, [X, Y, m]
def test_get_unique_names_from_duplicates(self):
    """Check de-duplication of names, with and without the duplicate list.

    Each case is (proposed names, extra keyword arguments, expected
    result); cases run in order and the first mismatch fails the test.
    """
    with_duplicated = {"return_duplicated": True}
    cases = (
        (["foo", "bar", "baz"], {},
         ["foo", "bar", "baz"]),
        (["foo", "bar", "baz", "bar"], {},
         ["foo", "bar (1)", "baz", "bar (2)"]),
        (["x", "x", "x (1)"], {},
         ["x (2)", "x (3)", "x (1)"]),
        (["x (2)", "x", "x", "x (2)", "x (3)"], {},
         ["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"]),
        (["iris", "iris", "iris (1)"], {},
         ["iris (2)", "iris (3)", "iris (1)"]),

        (["foo", "bar", "baz"], with_duplicated,
         (["foo", "bar", "baz"], [])),
        (["foo", "bar", "baz", "bar"], with_duplicated,
         (["foo", "bar (1)", "baz", "bar (2)"], ["bar"])),
        (["x", "x", "x (1)"], with_duplicated,
         (["x (2)", "x (3)", "x (1)"], ["x"])),
        (["x (2)", "x", "x", "x (2)", "x (3)"], with_duplicated,
         (["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"],
          ["x (2)", "x"])),

        (["x", "", "", None, None, "x"], {},
         ["x (1)", "", "", None, None, "x (2)"]),
        (["iris", "iris", "iris (1)", "iris (2)"], with_duplicated,
         (["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"])),

        (["iris (1) (1)", "iris (1)", "iris (1)"], {},
         ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"]),
        (["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"], {},
         ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)",
          "iris (2)", "iris (3)"]),
    )
    for proposed, kwargs, expected in cases:
        self.assertEqual(
            get_unique_names_duplicates(proposed, **kwargs), expected)