def test_normalize_data(self):
    """Widget clusters must match a manual DBSCAN run, with and without normalization."""
    # Without normalization: output must equal DBSCAN run directly on raw data.
    self.widget.controls.normalize.setChecked(False)
    data = Table("heart_disease")
    self.send_signal(self.widget.Inputs.data, data)
    params = dict(eps=self.widget.eps,
                  min_samples=self.widget.min_samples,
                  metric="euclidean")
    expected = DBSCAN(**params)(data)
    annotated = self.get_output(self.widget.Outputs.annotated_data)
    labels = annotated.metas[:, 0].copy()
    labels[np.isnan(labels)] = -1  # unclustered instances are stored as NaN
    np.testing.assert_array_equal(labels, expected)

    # With normalization: replicate the widget's preprocessing by hand first.
    self.widget.controls.normalize.setChecked(True)
    params = dict(eps=self.widget.eps,
                  min_samples=self.widget.min_samples,
                  metric="euclidean")
    for preprocessor in (Continuize(), Normalize(), SklImpute()):
        data = preprocessor(data)
    expected = DBSCAN(**params)(data)
    annotated = self.get_output(self.widget.Outputs.annotated_data)
    labels = annotated.metas[:, 0].copy()
    labels[np.isnan(labels)] = -1
    np.testing.assert_array_equal(labels, expected)
def set_data(self, data):
    """Receive input data; keep non-constant continuous features, imputed."""
    self.closeContext()
    self.clear_messages()
    self.data = data
    self.cont_data = None
    self.selection = ()
    if data is not None:
        if len(data) < 2:
            self.Warning.not_enough_inst()
        else:
            domain = data.domain
            numeric = [attr for attr in domain.attributes if attr.is_continuous]
            reduced = Table.from_table(
                Domain(numeric, domain.class_vars, domain.metas), data)
            const_remover = Remove(Remove.RemoveConstant)
            reduced = const_remover(reduced)
            if const_remover.attr_results["removed"]:
                self.Information.removed_cons_feat()
            if len(reduced.domain.attributes) < 2:
                self.Warning.not_enough_vars()
            else:
                # Working table: continuous, non-constant, no missing values.
                self.cont_data = SklImpute()(reduced)
    self.set_feature_model()
    self.openContext(self.cont_data)
    self.apply()
    self.vizrank.button.setEnabled(self.cont_data is not None)
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Parameters
    ----------
    table : Orange.data.Table
        Input data; may contain discrete attributes and missing values.

    Returns
    -------
    Orange.data.Table
        A table restricted to continuous attributes with NaNs imputed;
        an empty table is returned unchanged.
    """
    if not len(table):
        return table
    new_domain = data.Domain(
        [a for a in table.domain.attributes if a.is_continuous],
        table.domain.class_vars,
        table.domain.metas)
    new_data = data.Table(new_domain, table)
    # Instantiate the preprocessor and apply it explicitly, instead of the
    # implicit `SklImpute(new_data)` constructor-apply shortcut, for
    # consistency with the other _preprocess implementation in the project.
    new_data = SklImpute()(new_data)
    return new_data
def _preprocess(table, impute=True):
    """Drop categorical attributes; optionally impute missing values.

    An empty table is returned as-is.
    """
    if not len(table):
        return table
    continuous = [a for a in table.domain.attributes if a.is_continuous]
    reduced = table.transform(
        Domain(continuous, table.domain.class_vars, table.domain.metas))
    return SklImpute()(reduced) if impute else reduced
class EllipticEnvelopeLearner(SklLearner):
    """Learner wrapping scikit-learn's EllipticEnvelope outlier detector."""
    __wraps__ = skl_covariance.EllipticEnvelope
    __returns__ = EllipticEnvelopeClassifier
    # Applied before fitting: one-hot encode discrete features, drop all-NaN
    # columns, impute the remaining missing values.
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, contamination=0.1,
                 random_state=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        # vars() snapshots all local names (i.e. the keyword arguments) so
        # the wrapper can forward them to the scikit-learn constructor; do
        # not introduce additional locals before this line.
        self.params = vars()
class PolynomialLearner(Learner):
    """Fits the wrapped learner on polynomially expanded input features."""
    name = 'poly learner'
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, learner, degree=1, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.degree = degree
        self.learner = learner

    def fit(self, X, Y, W):
        """Expand X to polynomial features of `self.degree` and fit on them."""
        expander = skl_preprocessing.PolynomialFeatures(self.degree)
        expanded = expander.fit_transform(X)
        if W is not None and self.supports_weights:
            fitted = self.learner.fit(expanded, Y, sample_weight=W.reshape(-1))
        else:
            fitted = self.learner.fit(expanded, Y, None)
        return PolynomialModel(fitted, expander)
class TreeRegressionLearner(SklLearner):
    """Regression-tree learner wrapping scikit-learn's DecisionTreeRegressor."""
    __wraps__ = skl_tree.DecisionTreeRegressor
    __returns__ = TreeRegressor
    name = 'regression tree'
    # NOTE(review): this order (impute before continuize) differs from the
    # other learners in the project, which continuize first — confirm the
    # ordering here is intentional.
    preprocessors = [RemoveNaNColumns(), SklImpute(), Continuize()]

    def __init__(self, criterion="mse", splitter="best", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features=None,
                 random_state=None, max_leaf_nodes=None, preprocessors=None):
        # NOTE(review): criterion="mse" is deprecated (renamed
        # "squared_error") in newer scikit-learn — verify against the
        # pinned sklearn version.
        super().__init__(preprocessors=preprocessors)
        # vars() snapshots the keyword arguments to forward to sklearn;
        # do not add locals before this line.
        self.params = vars()
class Clustering(metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional (default = [Continuize(), SklImpute()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    __wraps__ = None
    __returns__ = ClusteringModel
    preprocessors = [Continuize(), SklImpute()]

    def __init__(self, preprocessors, parameters):
        # Fall back to the class-level default pipeline when none is given.
        self.preprocessors = (self.preprocessors if preprocessors is None
                              else preprocessors)
        # Strip bookkeeping names so only estimator parameters remain.
        excluded = ("self", "preprocessors", "__class__")
        self.params = {name: value for name, value in parameters.items()
                       if name not in excluded}

    def __call__(self, data):
        return self.get_model(data).labels

    def get_model(self, data):
        # Remember the pre-preprocessing domain so the model can relate
        # its results back to the caller's data.
        original = data.domain
        preprocessed = self.preprocess(data)
        model = self.fit_storage(preprocessed)
        model.domain = preprocessed.domain
        model.original_domain = original
        return model

    def fit_storage(self, data):
        # only data Table
        return self.fit(data.X)

    def fit(self, X: np.ndarray, y: np.ndarray = None):
        return self.__returns__(self.__wraps__(**self.params).fit(X))

    def preprocess(self, data):
        """Apply each preprocessor in order and return the result."""
        for step in self.preprocessors:
            data = step(data)
        return data
def set_data(self, data):
    """Receive input data and prepare the continuous-only, imputed table."""
    self.closeContext()
    self.clear_messages()
    self.data = data
    self.cont_data = None
    self.selection = ()
    if data is not None:
        domain = data.domain
        numeric = [attr for attr in domain.attributes if attr.is_continuous]
        if len(numeric) < 2:
            self.Information.not_enough_vars()
        elif len(data) < 2:
            self.Information.not_enough_inst()
        else:
            reduced = Table.from_table(
                Domain(numeric, domain.class_vars, domain.metas), data)
            self.cont_data = SklImpute()(reduced)
    self.apply()
    self.openContext(self.data)
    self._vizrank_select()
from sklearn.metrics import pairwise_distances from Orange.preprocess import Normalize, Continuize, SklImpute from Orange.widgets import widget, gui from Orange.widgets.utils.slidergraph import SliderGraph from Orange.widgets.settings import Setting from Orange.data import Table, DiscreteVariable from Orange.data.util import get_unique_names from Orange.clustering import DBSCAN from Orange.widgets.utils.annotated_data import ANNOTATED_DATA_SIGNAL_NAME from Orange.widgets.utils.signals import Input, Output from Orange.widgets.widget import Msg DEFAULT_CUT_POINT = 0.1 PREPROCESSORS = [Continuize(), Normalize(), SklImpute()] EPS_BOTTOM_LIMIT = 0.01 def get_kth_distances(data, metric, k=5): """ The function computes the epsilon parameter for DBSCAN through method proposed in the paper. Parameters ---------- data : Orange.data.Table Visualisation coordinates - embeddings metric : callable or str The metric to compute the distance. k : int Number kth observed neighbour
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[HasClass(), Continuize(), RemoveNaNColumns(), SklImpute()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    # Default pipeline: drop instances without a class value, one-hot
    # encode discrete features, drop all-NaN columns, impute the rest.
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        # Filter the supplied values down to the wrapped estimator's
        # constructor arguments before storing them.
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        """Return the subset of `values` accepted by `__wraps__.__init__`."""
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        """Run the preprocessors; reject multinomial discrete attributes."""
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        # Carry the estimator parameters over to the fitted model.
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        # scikit-learn expects a 1-D target array.
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        # Detected by inspecting the wrapped estimator's fit() signature.
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        # Expose estimator parameters as attributes of the learner.
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    """Regression learner for a wrapped CatBoost estimator.

    NOTE(review): the param/fit/introspection machinery below duplicates
    SklLearner; consider inheriting it instead of copying.
    """
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}
    learner_adequacy_err_msg = "Continuous class variable expected."

    # Default pipeline: drop instances without a class value, one-hot
    # encode discrete features, drop all-NaN columns, impute the rest.
    preprocessors = default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]

    def check_learner_adequacy(self, domain):
        # Regression requires a continuous class variable.
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        # Filter the supplied values down to the wrapped estimator's
        # constructor arguments before storing them.
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        """Return the subset of `values` accepted by `__wraps__.__init__`."""
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {
                name: values[name] for name in spec.args[1:]
                if name in values
            }
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        """Run the preprocessors; reject multinomial discrete attributes."""
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        # Carry the estimator parameters over to the fitted model.
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        # The wrapped estimator expects a 1-D target array.
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        # Detected by inspecting the wrapped estimator's fit() signature.
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames

    def __getattr__(self, item):
        # Expose estimator parameters as attributes of the learner.
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        dd = super().__dir__()
        return list(sorted(set(dd) | set(self.params.keys())))
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        (default=[Continuize(), RemoveNaNColumns(), SklImpute(force=False)])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = None
    name = 'skl learner'
    # One-hot encode discrete features, drop all-NaN columns, impute
    # missing values (force=False passed to SklImpute).
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute(force=False)]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        # Filter the supplied values down to the wrapped estimator's
        # constructor arguments before storing them.
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        """Return the subset of `values` accepted by `__wraps__.__init__`."""
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {name: values[name] for name in spec.args[1:]
                      if name in values}
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        """Run the preprocessors; reject multinomial discrete attributes."""
        data = super().preprocess(data)
        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        # Record the distinct class values seen in training data.
        m.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        m.params = self.params
        return m

    def fit(self, X, Y, W):
        clf = self.__wraps__(**self.params)
        # scikit-learn expects a 1-D target array.
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    def __repr__(self):
        return '{} {}'.format(self.name, self.params)
def impute(data):
    """Return `data` with missing values filled in via SklImpute."""
    imputer = SklImpute()
    return imputer(data)
def _calculate_table_values(self):
    """Build the (optionally log-scaled/normalized) matrix shown in the table.

    Reads self.data / self.aggregated_data / self.clusters_unordered and the
    display flags; writes self.matrix, self.matrix_before_norm and
    self.clusters, then refreshes the view.
    """
    # Cap the number of genes (columns) displayed.
    genes = self.data.domain.attributes[:self.GENE_MAXIMUM]
    matrix = self.aggregated_data
    clusters = self.clusters_unordered
    if self.transpose:
        matrix, clusters, genes = self._transpose(matrix, clusters, genes)

    # create data table since imputation of nan values is required
    matrix = Table(Domain(genes), matrix)
    matrix_before_norm = matrix.copy()  # for tooltip
    matrix = SklImpute()(matrix)
    if self.log_scale:
        # log(x + 1) keeps zeros finite
        matrix.X = np.log(matrix.X + 1)
    if self.normalize:
        matrix.X = self._normalize(matrix.X)

    # values must be in range [0, 1] for visualisation
    matrix.X = self._norm_min_max(matrix.X)

    if self.biclustering:
        cluster_order, gene_order = self.cluster_data(matrix)
    else:
        # Identity ordering when biclustering is disabled.
        cluster_order, gene_order = np.arange(matrix.X.shape[0]), np.arange(matrix.X.shape[1])

    # reorder
    self.matrix = matrix[cluster_order][:, gene_order]
    self.matrix_before_norm = matrix_before_norm[cluster_order][:, gene_order]
    self.clusters = clusters[cluster_order]
    self._refresh_table()
    self._update_selection()
    self._invalidate()