def test_unique(self):
    """Check `unique` on sparse matrices against `np.unique` on dense data."""
    for dense in self.data:
        sparse = csr_matrix(dense)

        # Values only.
        actual_values = unique(sparse, return_counts=False)
        expected_values = np.unique(dense, return_counts=False)
        np.testing.assert_array_equal(actual_values, expected_values)

        # Values together with counts, compared element by element.
        actual = unique(sparse, return_counts=True)
        expected = np.unique(dense, return_counts=True)
        for got, want in zip(actual, expected):
            np.testing.assert_array_equal(got, want)
def test_unique(self):
    """`unique` on a CSR matrix must match `np.unique` on the dense input."""
    for dense_x in self.data:
        sparse_x = csr_matrix(dense_x)

        # Without counts both calls return a single array of values.
        sparse_values = unique(sparse_x, return_counts=False)
        dense_values = np.unique(dense_x, return_counts=False)
        np.testing.assert_array_equal(sparse_values, dense_values)

        # With counts, compare the returned arrays pairwise.
        sparse_result = unique(sparse_x, return_counts=True)
        dense_result = np.unique(dense_x, return_counts=True)
        for sparse_part, dense_part in zip(sparse_result, dense_result):
            np.testing.assert_array_equal(sparse_part, dense_part)
def test_sparse_explicit_zeros(self):
    """Assigning zeros to off-diagonal cells must not change `unique`."""
    # Assign through `lil_matrix` so that construction does not raise the
    # sparse warning.
    with_zeros = lil_matrix(np.eye(3))
    for row, col in ((0, 1), (1, 0)):
        with_zeros[row, col] = 0
    with_zeros = with_zeros.tocsr()

    # Reference: the untouched identity matrix.
    identity = csr_matrix(np.eye(3))

    reference = unique(identity, return_counts=True)
    result = unique(with_zeros, return_counts=True)
    np.testing.assert_array_equal(reference, result)
def test_sparse_explicit_zeros(self):
    """`unique` must treat zeros written into the matrix like any other zero."""
    # Build via `lil_matrix`, which avoids the sparse warning on assignment.
    modified = lil_matrix(np.eye(3))
    for position in ((0, 1), (1, 0)):
        modified[position] = 0
    modified = modified.tocsr()

    # Compare against the plain identity matrix.
    plain = csr_matrix(np.eye(3))

    from_plain = unique(plain, return_counts=True)
    from_modified = unique(modified, return_counts=True)
    np.testing.assert_array_equal(from_plain, from_modified)
def test_returns_counts(self, array):
    """The second item returned with `return_counts=True` holds the counts."""
    # pylint: disable=bad-whitespace
    rows = [[-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.]]
    data = array(rows)
    expected_counts = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

    counts = unique(data, return_counts=True)[1]
    np.testing.assert_equal(counts, expected_counts)
def _init_feature_marker_values(self):
    """Recompute `self.feature_marker_values`, one entry per attribute.

    For each attribute of `self.domain` a feature value is picked and turned
    into a marker value:

    * when `self.log_reg_coeffs` is non-empty, the most frequent value
      (discrete attributes) or the `nanmean` (other attributes) of the
      attribute's column in `self.data.X`;
    * when `self.instances` is given, the attribute is in its domain and the
      first instance's value is not NaN, that value takes precedence.

    Attributes for which no feature value could be picked get a marker of 0.
    """
    self.feature_marker_values = []
    cls_index = self.target_class_index
    if self.instances:
        instances = Table(self.domain, self.instances)
    else:
        instances = None

    markers = []
    for idx, attr in enumerate(self.domain.attributes):
        feature_val = None
        if len(self.log_reg_coeffs):
            column = self.data.X[:, idx]
            if attr.is_discrete:
                uniques, counts = unique(column, return_counts=True)
                feature_val = np.nan_to_num(uniques[np.argmax(counts)])
            else:
                feature_val = nanmean(column)

        # Data arriving on the separate signal wins: its first instance
        # positions the point instead of the value computed above.
        inst_in_dom = instances and attr in instances.domain
        if inst_in_dom and not np.isnan(instances[0][attr]):
            feature_val = instances[0][attr]

        if feature_val is None:
            marker = 0
        elif attr.is_discrete:
            marker = self.points[idx][cls_index][int(feature_val)]
        else:
            marker = self.log_reg_coeffs_orig[idx][cls_index][0] * feature_val
        markers.append(marker)

    self.feature_marker_values = np.asarray(markers)
def test_returns_unique_values(self, array):
    """Without counts, `unique` returns the expected values (NaNs kept)."""
    # pylint: disable=bad-whitespace
    rows = [[-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.]]
    data = array(rows)
    expected_values = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]

    values = unique(data, return_counts=False)
    np.testing.assert_equal(values, expected_values)
def _init_feature_marker_values(self):
    """Fill `self.feature_marker_values` with one marker per attribute.

    The feature value used for an attribute is, in order of precedence:
    the first row of `self.instances` (if given, the attribute is in its
    domain and the value is not NaN), otherwise - when
    `self.log_reg_coeffs` is non-empty - the most frequent value of the
    column in `self.data.X` for discrete attributes or its `nanmean` for
    the others. If neither applies the marker stays 0.
    """
    self.feature_marker_values = []
    cls_index = self.target_class_index
    instances = None
    if self.instances:
        instances = Table(self.domain, self.instances)

    collected = []
    for pos, attr in enumerate(self.domain.attributes):
        chosen = None
        if len(self.log_reg_coeffs):
            col = self.data.X[:, pos]
            if attr.is_discrete:
                distinct, freq = unique(col, return_counts=True)
                chosen = np.nan_to_num(distinct[np.argmax(freq)])
            else:
                chosen = nanmean(col)

        # When data is provided on the separate signal, its first instance
        # positions the point instead of the computed value.
        inst_in_dom = instances and attr in instances.domain
        if inst_in_dom and not np.isnan(instances[0][attr]):
            chosen = instances[0][attr]

        marker = 0
        if chosen is not None:
            if attr.is_discrete:
                marker = self.points[pos][cls_index][int(chosen)]
            else:
                marker = self.log_reg_coeffs_orig[pos][cls_index][0] * chosen
        collected.append(marker)

    self.feature_marker_values = np.asarray(collected)
def set_train_data(self, data):
    """
    Set the input training dataset.

    The data is validated first; when a check fails, the corresponding
    error is shown and the input is treated as `None`.

    Parameters
    ----------
    data : Optional[Orange.data.Table]
    """
    self.cancel()
    self.Information.data_sampled.clear()
    self.Error.train_data_error.clear()

    if data is not None:
        # All conditions are evaluated up front; only the first failing
        # one is reported.
        checks = [
            ("Train dataset is empty.", len(data) == 0),
            ("Train data input requires a target variable.",
             not data.domain.class_vars),
            ("Too many target variables.",
             len(data.domain.class_vars) > 1),
            ("Target variable has no values.",
             np.isnan(data.Y).all()),
            ("Target variable has only one value.",
             data.domain.has_discrete_class and len(unique(data.Y)) < 2),
            ("Data has no features to learn from.",
             data.X.shape[1] == 0),
        ]
        failure = next((msg for msg, failed in checks if failed), None)
        if failure is not None:
            self.Error.train_data_error(failure)
            data = None

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            # Too large to fetch as a whole: work on a sample and say so.
            self.Information.data_sampled()
            sample = data.sample_time(1, no_cache=True)
            sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(sample)

    if data is None:
        self.train_data_missing_vals = False
    else:
        self.train_data_missing_vals = np.isnan(data.Y).any()

    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = HasClass()(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    self._update_scorers()
    self._update_controls()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain)
        if self.fold_feature_selected and bool(self.feature_model):
            self.resampling = OWTestAndScore.FeatureFold
    self._invalidate()
def test_returns_counts(self, array):
    """Counts returned by `unique(..., return_counts=True)` match expectation."""
    # pylint: disable=bad-whitespace
    data = array([
        [-1., 1., 0., 2., 3., np.nan],
        [ 0., 0., 0., 3., 5., np.nan],
        [-1., 0., 0., 1., 7., 6.],
    ])
    expected_counts = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

    result = unique(data, return_counts=True)
    np.testing.assert_equal(result[1], expected_counts)
def test_returns_unique_values(self, array):
    """Values returned by `unique(..., return_counts=False)` match expectation."""
    # pylint: disable=bad-whitespace
    data = array([
        [-1., 1., 0., 2., 3., np.nan],
        [ 0., 0., 0., 3., 5., np.nan],
        [-1., 0., 0., 1., 7., 6.],
    ])
    expected_values = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]

    result = unique(data, return_counts=False)
    np.testing.assert_equal(result, expected_values)
def get_domain(self, domain, data):
    """Create domain (and dataset) from changes made in the widget.

    Parameters
    ----------
    domain : old domain
    data : source data

    Returns
    -------
    (new_domain, [attribute_columns, class_var_columns, meta_columns])
    """
    edited = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    # Original variables paired with the place they currently occupy, in
    # the same order as the rows of the model.
    originals = chain(
        [(at, Place.feature) for at in domain.attributes],
        [(cl, Place.class_var) for cl in domain.class_vars],
        [(mt, Place.meta) for mt in domain.metas])

    for (name, tpe, place, _, _), (orig_var, orig_plc) in zip(edited,
                                                              originals):
        if place == Place.skip:
            continue

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)
        same_type = tpe == type(orig_var)

        if same_type and name == orig_var.name:
            var = orig_var
        elif same_type:
            # change the name so that all_vars will get the correct name
            orig_var.name = name
            var = orig_var
        elif tpe == DiscreteVariable:
            values = [str(i) for i in unique(col_data)
                      if not self._is_missing(i)]
            var = tpe(name, values)
            indices = [np.nan if self._is_missing(x)
                       else values.index(str(x))
                       for x in self._iter_vals(col_data)]
            col_data = self._to_column(indices, is_sparse)
        elif tpe == StringVariable and type(orig_var) == DiscreteVariable:
            var = tpe(name)
            texts = ["" if np.isnan(x) else orig_var.repr_val(x)
                     for x in self._iter_vals(col_data)]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(texts, False, dtype=object)
        else:
            var = tpe(name)

        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    if feats:
        X = self._merge(feats)
    else:
        X = np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    return domain, [X, Y, m]
def test_unique_explicit_zeros(self):
    """Zeros assigned into a CSR matrix must not change what `unique` returns."""
    reference = csr_matrix(np.eye(3))
    modified = csr_matrix(np.eye(3))

    # Set some off-diagonal elements to zero; the efficiency warning that
    # assignment into a CSR matrix may raise is irrelevant here.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=sp.sparse.SparseEfficiencyWarning)
        for cell in ((0, 1), (1, 0)):
            modified[cell] = 0

    # Same comparison without and with counts.
    for with_counts in (False, True):
        np.testing.assert_array_equal(
            unique(reference, return_counts=with_counts),
            unique(modified, return_counts=with_counts),
        )
def test_unique_explicit_zeros(self):
    """`unique` gives the same answer after zeros are written into the matrix."""
    untouched = csr_matrix(np.eye(3))
    zeroed = csr_matrix(np.eye(3))

    # Write zeros into two off-diagonal cells, silencing the sparse
    # efficiency warning for the duration of the assignment.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=sp.sparse.SparseEfficiencyWarning)
        zeroed[0, 1] = 0
        zeroed[1, 0] = 0

    values_expected = unique(untouched, return_counts=False)
    values_actual = unique(zeroed, return_counts=False)
    np.testing.assert_array_equal(values_expected, values_actual)

    counted_expected = unique(untouched, return_counts=True)
    counted_actual = unique(zeroed, return_counts=True)
    np.testing.assert_array_equal(counted_expected, counted_actual)
def test_returns_counts(self, array):
    """`unique(..., return_counts=True)` returns the values and their counts."""
    # pylint: disable=bad-whitespace
    rows = [[-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., 42.],
            [-1., 0., 0., 1., 7., 6.]]
    data = array(rows)
    expected_values = [-1, 0, 1, 2, 3, 5, 6, 7, 42, np.nan]
    expected_counts = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

    result = unique(data, return_counts=True)
    values, counts = result
    np.testing.assert_equal(values, expected_values)
    np.testing.assert_equal(counts, expected_counts)
def check_data(self):
    """Validate `self.data` for `self.learner` and report any problem.

    Sets `self.valid_data` and returns it. When either the data or the
    learner is missing, nothing is checked and the result is False.
    """
    self.valid_data = False
    self.Error.sparse_not_supported.clear()

    if self.data is None or self.learner is None:
        return self.valid_data

    self.Error.data_error.clear()
    # The checks are ordered; the first one that fails is the one reported.
    if not self.learner.check_learner_adequacy(self.data.domain):
        self.Error.data_error(self.learner.learner_adequacy_err_msg)
    elif len(self.data) == 0:
        self.Error.data_error("Dataset is empty.")
    elif len(ut.unique(self.data.Y)) < 2:
        self.Error.data_error("Data contains a single target value.")
    elif self.data.X.size == 0:
        self.Error.data_error("Data has no features to learn from.")
    elif self.data.is_sparse() and not self.supports_sparse:
        self.Error.sparse_not_supported()
    else:
        self.valid_data = True

    return self.valid_data
def _init_feature_marker_values(self):
    """Rebuild the list `self.feature_marker_values`, one entry per attribute.

    When `self.log_reg_coeffs` is non-empty, the feature value of an
    attribute is the most frequent value of its column in `self.data.X`
    (discrete attributes) or the column's `mean` (other attributes). A
    non-NaN value of the first row of `self.instances` overrides it when
    the attribute is in that table's domain. Attributes without any feature
    value get a marker of 0.
    """
    self.feature_marker_values = []
    cls_index = self.target_class_index
    if self.instances:
        instances = Table(self.domain, self.instances)
    else:
        instances = None

    for idx, attr in enumerate(self.domain.attributes):
        feature_val = None
        if len(self.log_reg_coeffs):
            column = self.data.X[:, idx]
            if attr.is_discrete:
                uniques, counts = unique(column, return_counts=True)
                feature_val = np.nan_to_num(uniques[np.argmax(counts)])
            else:
                feature_val = mean(column)

        # The first provided instance, when usable, replaces the value
        # computed from the training data.
        inst_in_dom = instances and attr in instances.domain
        if inst_in_dom and not np.isnan(instances[0][attr]):
            feature_val = instances[0][attr]

        if feature_val is None:
            marker = 0
        elif attr.is_discrete:
            marker = self.points[idx][cls_index][int(feature_val)]
        else:
            marker = self.log_reg_coeffs_orig[idx][cls_index][0] * feature_val
        self.feature_marker_values.append(marker)
def check_data(self):
    """Validate `self.data` for `self.learner` and report any problem.

    The learner's classes are searched in MRO order: the first class that
    defines `incompatibility_reason` or the deprecated
    `check_learner_adequacy` decides how compatibility is determined.
    Sets `self.valid_data` and returns it.
    """
    self.valid_data = False
    self.Error.sparse_not_supported.clear()

    if self.data is None or self.learner is None:
        return self.valid_data

    self.Error.data_error.clear()

    reason = None
    for klass in type(self.learner).mro():
        defined_here = vars(klass)
        if 'incompatibility_reason' in defined_here:
            # pylint: disable=assignment-from-none
            reason = self.learner.incompatibility_reason(self.data.domain)
            break
        if 'check_learner_adequacy' in defined_here:
            warnings.warn(
                "check_learner_adequacy is deprecated and will be removed "
                "in upcoming releases. Learners should instead implement "
                "the incompatibility_reason method.",
                OrangeDeprecationWarning)
            adequate = self.learner.check_learner_adequacy(self.data.domain)
            if not adequate:
                reason = self.learner.learner_adequacy_err_msg
            break

    # Ordered checks; only the first failing one is reported.
    if reason is not None:
        self.Error.data_error(reason)
    elif len(self.data) == 0:
        self.Error.data_error("Dataset is empty.")
    elif len(ut.unique(self.data.Y)) < 2:
        self.Error.data_error("Data contains a single target value.")
    elif self.data.X.size == 0:
        self.Error.data_error("Data has no features to learn from.")
    elif self.data.is_sparse() and not self.supports_sparse:
        self.Error.sparse_not_supported()
    else:
        self.valid_data = True

    return self.valid_data
def get_domain(self, domain, data):
    """Create domain (and dataset) from changes made in the widget.

    Parameters
    ----------
    domain : old domain
    data : source data

    Returns
    -------
    (new_domain, [attribute_columns, class_var_columns, meta_columns])
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    variables = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, col_data):
        """Tell whether all non-NaN values of a continuous column are whole."""
        if type(var) == ContinuousVariable:
            data = np.asarray(col_data.data)  # Works for dense and sparse
            data = data[~np.isnan(data)]
            return (data == data.astype(int)).all()
        return False

    # Exit early with original domain if the user didn't actually change anything
    if all((name == orig_var.name and tpe == type(orig_var) and place == orig_plc)
           for (name, tpe, place, _, _), (orig_var, orig_plc) in
           zip(variables,
               chain(((at, Place.feature) for at in domain.attributes),
                     ((cl, Place.class_var) for cl in domain.class_vars),
                     ((mt, Place.meta) for mt in domain.metas)))):
        return domain, [data.X, data.Y, data.metas]

    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(variables,
                chain([(at, Place.feature) for at in domain.attributes],
                      [(cl, Place.class_var) for cl in domain.class_vars],
                      [(mt, Place.meta) for mt in domain.metas])):
        if place == Place.skip:
            continue

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)

        if name == orig_var.name and tpe == type(orig_var):
            var = orig_var
        elif tpe == type(orig_var):
            # change the name so that all_vars will get the correct name
            orig_var.name = name
            var = orig_var
        elif tpe == DiscreteVariable:
            values = list(
                str(i) for i in unique(col_data)
                if not self._is_missing(i))
            round_numbers = numbers_are_round(orig_var, col_data)
            col_data = [
                np.nan if self._is_missing(x) else values.index(str(x))
                for x in self._iter_vals(col_data)
            ]
            # Indices were looked up with the original textual values; only
            # now are whole numbers shown without the trailing ".0".
            if round_numbers:
                values = [str(int(float(v))) for v in values]
            var = tpe(name, values)
            col_data = self._to_column(col_data, is_sparse)
        elif tpe == StringVariable:
            var = tpe(name)
            if type(orig_var) == DiscreteVariable:
                col_data = [
                    orig_var.repr_val(x) if not np.isnan(x) else ""
                    for x in self._iter_vals(col_data)
                ]
            elif type(orig_var) == ContinuousVariable:
                round_numbers = numbers_are_round(orig_var, col_data)
                # The NaN test must come first: `numbers_are_round` ignores
                # NaNs, so a column of whole numbers can still contain
                # missing values, and int(nan) raises ValueError.
                col_data = [
                    "" if np.isnan(x)
                    else str(int(x)) if round_numbers
                    else orig_var.repr_val(x)
                    for x in self._iter_vals(col_data)
                ]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
            var = tpe(name)
            if may_be_numeric:
                col_data = [
                    np.nan if self._is_missing(x)
                    else float(orig_var.values[int(x)])
                    for x in self._iter_vals(col_data)
                ]
                # NOTE(review): indentation was lost in the source; this
                # conversion is assumed to belong to the `may_be_numeric`
                # branch - confirm.
                col_data = self._to_column(col_data, is_sparse)
        else:
            var = tpe(name)

        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    return domain, [X, Y, m]
def get_domain(self, domain, data):
    """Create domain (and dataset) from changes made in the widget.

    Parameters
    ----------
    domain : old domain
    data : source data

    Returns
    -------
    (new_domain, [attribute_columns, class_var_columns, meta_columns])
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    edited = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, column):
        """Tell whether all non-NaN values of a continuous column are whole."""
        if type(var) != ContinuousVariable:
            return False
        raw = np.asarray(column.data)  # Works for dense and sparse
        present = raw[~np.isnan(raw)]
        return (present == present.astype(int)).all()

    # Original variables paired with the place they currently occupy, in
    # the same order as the rows of the model.
    originals = (
        [(at, Place.feature) for at in domain.attributes]
        + [(cl, Place.class_var) for cl in domain.class_vars]
        + [(mt, Place.meta) for mt in domain.metas]
    )

    # Exit early with original domain if the user didn't actually change anything
    untouched = all(
        name == orig_var.name and tpe == type(orig_var) and place == orig_plc
        for (name, tpe, place, _, _), (orig_var, orig_plc)
        in zip(edited, originals))
    if untouched:
        return domain, [data.X, data.Y, data.metas]

    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(edited, originals):
        if place == Place.skip:
            continue

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)
        orig_type = type(orig_var)

        if name == orig_var.name and tpe == orig_type:
            var = orig_var
        elif tpe == orig_type:
            # change the name so that all_vars will get the correct name
            orig_var.name = name
            var = orig_var
        elif tpe == DiscreteVariable:
            values = [str(i) for i in unique(col_data)
                      if not self._is_missing(i)]
            round_numbers = numbers_are_round(orig_var, col_data)
            indices = [np.nan if self._is_missing(x)
                       else values.index(str(x))
                       for x in self._iter_vals(col_data)]
            # Indices were looked up with the original textual values;
            # whole numbers are shortened only afterwards.
            if round_numbers:
                values = [str(int(float(v))) for v in values]
            var = tpe(name, values)
            col_data = self._to_column(indices, is_sparse)
        elif tpe == StringVariable:
            var = tpe.make(name)
            if orig_type == DiscreteVariable:
                col_data = ["" if np.isnan(x) else orig_var.repr_val(x)
                            for x in self._iter_vals(col_data)]
            elif orig_type == ContinuousVariable:
                round_numbers = numbers_are_round(orig_var, col_data)
                texts = []
                for x in self._iter_vals(col_data):
                    if np.isnan(x):
                        texts.append('')
                    elif round_numbers:
                        texts.append(str(int(x)))
                    else:
                        texts.append(orig_var.repr_val(x))
                col_data = texts
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and orig_type == DiscreteVariable:
            var = tpe.make(name)
            if may_be_numeric:
                numbers = [np.nan if self._is_missing(x)
                           else float(orig_var.values[int(x)])
                           for x in self._iter_vals(col_data)]
                # NOTE(review): indentation was lost in the source; this
                # conversion is assumed to belong to the `may_be_numeric`
                # branch - confirm.
                col_data = self._to_column(numbers, is_sparse)
        else:
            var = tpe(name)

        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    if feats:
        X = self._merge(feats)
    else:
        X = np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    return domain, [X, Y, m]
def get_domain(self, domain, data):
    """Create domain (and dataset) from changes made in the widget.

    Parameters
    ----------
    domain : old domain
    data : source data

    Returns
    -------
    (new_domain, [attribute_columns, class_var_columns, meta_columns])
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    variables = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    for (name, tpe, place, _, _), (orig_var, orig_plc) in \
            zip(variables,
                chain([(at, Place.feature) for at in domain.attributes],
                      [(cl, Place.class_var) for cl in domain.class_vars],
                      [(mt, Place.meta) for mt in domain.metas])):
        if place == Place.skip:
            continue

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)
        # True when a continuous column holds only whole numbers (NaNs are
        # skipped by the check, so the column may still contain them).
        cont_ints = type(orig_var) == ContinuousVariable and \
            all(x.is_integer() for x in self._iter_vals(col_data)
                if not np.isnan(x))
        # True when every value of a discrete variable is a decimal string.
        disc_ints = type(orig_var) == DiscreteVariable and \
            all(x.isdecimal() for x in orig_var.values)

        if name == orig_var.name and tpe == type(orig_var):
            var = orig_var
        elif tpe == type(orig_var):
            # change the name so that all_vars will get the correct name
            orig_var.name = name
            var = orig_var
        elif tpe == DiscreteVariable:
            values = list(str(i) for i in unique(col_data)
                          if not self._is_missing(i))
            col_data = [np.nan if self._is_missing(x)
                        else values.index(str(x))
                        for x in self._iter_vals(col_data)]
            # Indices were looked up with the original textual values; only
            # now are whole numbers shown without the trailing ".0".
            if cont_ints:
                values = [str(int(float(v))) for v in values]
            var = tpe(name, values)
            col_data = self._to_column(col_data, is_sparse)
        elif tpe == StringVariable:
            var = tpe(name)
            if type(orig_var) == DiscreteVariable:
                col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
                            for x in self._iter_vals(col_data)]
            elif type(orig_var) == ContinuousVariable:
                # The NaN test must come first: `cont_ints` ignores NaNs,
                # so missing values can be present, and int(nan) raises
                # ValueError.
                col_data = ["" if np.isnan(x)
                            else str(int(x)) if cont_ints
                            else orig_var.repr_val(x)
                            for x in self._iter_vals(col_data)]
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
            var = tpe(name)
            if disc_ints:
                col_data = [np.nan if self._is_missing(x)
                            else float(orig_var.values[int(x)])
                            for x in self._iter_vals(col_data)]
                # NOTE(review): indentation was lost in the source; this
                # conversion is assumed to belong to the `disc_ints`
                # branch - confirm.
                col_data = self._to_column(col_data, is_sparse)
        else:
            var = tpe(name)

        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)
    return domain, [X, Y, m]
def get_domain(self, domain, data, deduplicate=False):
    """
    Create domain (and dataset) from changes made in the widget.

    Args:
        domain (Domain): original domain
        data (Table): original data
        deduplicate (bool): if True, variable names are deduplicated and
           the result contains an additional list with names of renamed
           variables

    Returns:
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        or
        (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
    """
    # Allow type-checking with type() instead of isinstance() for exact comparison
    # pylint: disable=unidiomatic-typecheck
    edited = self.model().variables
    places = [[], [], []]  # attributes, class_vars, metas
    cols = [[], [], []]  # Xcols, Ycols, Mcols

    def numbers_are_round(var, column):
        """Tell whether all non-NaN values of a continuous column are whole."""
        if type(var) != ContinuousVariable:
            return False
        raw = np.asarray(column.data)  # Works for dense and sparse
        present = raw[~np.isnan(raw)]
        return (present == present.astype(int)).all()

    # Original variables paired with the place they currently occupy, in
    # the same order as the rows of the model.
    originals = (
        [(at, Place.feature) for at in domain.attributes]
        + [(cl, Place.class_var) for cl in domain.class_vars]
        + [(mt, Place.meta) for mt in domain.metas]
    )

    # Exit early with original domain if the user didn't actually change anything
    untouched = all(
        name == orig_var.name and tpe == type(orig_var) and place == orig_plc
        for (name, tpe, place, _, _), (orig_var, orig_plc)
        in zip(edited, originals))
    if untouched:
        unchanged = (domain, [data.X, data.Y, data.metas])
        return unchanged + ([],) if deduplicate else unchanged

    relevant_names = [row[0] for row in edited if row[2] != Place.skip]
    if deduplicate:
        renamed_iter = iter(get_unique_names_duplicates(relevant_names))
    else:
        renamed_iter = iter(relevant_names)
    renamed = []

    for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
            zip(edited, originals):
        if place == Place.skip:
            continue

        # One (possibly deduplicated) name is consumed per kept variable.
        new_name = next(renamed_iter)
        if new_name != name and name not in renamed:
            renamed.append(name)

        col_data = self._get_column(data, orig_var, orig_plc)
        is_sparse = sp.issparse(col_data)
        orig_type = type(orig_var)

        if new_name == orig_var.name and tpe == orig_type:
            var = orig_var
        elif tpe == orig_type:
            var = orig_var.copy(name=new_name)
        elif tpe == DiscreteVariable:
            values = natural_sorted(
                [str(i) for i in unique(col_data)
                 if not self._is_missing(i)])
            round_numbers = numbers_are_round(orig_var, col_data)
            indices = [np.nan if self._is_missing(x)
                       else values.index(str(x))
                       for x in self._iter_vals(col_data)]
            # Indices were looked up with the original textual values;
            # whole numbers are shortened only afterwards.
            if round_numbers:
                values = [str(int(float(v))) for v in values]
            var = tpe(new_name, values)
            col_data = self._to_column(indices, is_sparse)
        elif tpe == StringVariable:
            var = tpe.make(new_name)
            if orig_type in (DiscreteVariable, TimeVariable):
                col_data = ["" if np.isnan(x) else orig_var.repr_val(x)
                            for x in self._iter_vals(col_data)]
            elif orig_type == ContinuousVariable:
                round_numbers = numbers_are_round(orig_var, col_data)
                texts = []
                for x in self._iter_vals(col_data):
                    if np.isnan(x):
                        texts.append('')
                    elif round_numbers:
                        texts.append(str(int(x)))
                    else:
                        texts.append(orig_var.repr_val(x))
                col_data = texts
            # don't obey sparsity for StringVariable since they are
            # in metas which are transformed to dense below
            col_data = self._to_column(col_data, False, dtype=object)
        elif tpe == ContinuousVariable and orig_type == DiscreteVariable:
            var = tpe.make(new_name)
            if may_be_numeric:
                numbers = [np.nan if self._is_missing(x)
                           else float(orig_var.values[int(x)])
                           for x in self._iter_vals(col_data)]
                # NOTE(review): indentation was lost in the source; this
                # conversion is assumed to belong to the `may_be_numeric`
                # branch - confirm.
                col_data = self._to_column(numbers, is_sparse)
        else:
            var = tpe(new_name)

        places[place].append(var)
        cols[place].append(col_data)

    # merge columns for X, Y and metas
    feats = cols[Place.feature]
    if feats:
        X = self._merge(feats)
    else:
        X = np.empty((len(data), 0))
    Y = self._merge(cols[Place.class_var], force_dense=True)
    m = self._merge(cols[Place.meta], force_dense=True)
    domain = Domain(*places)

    result = (domain, [X, Y, m])
    return result + (renamed,) if deduplicate else result