def test_chain(self):
    """PCA projections agree regardless of where continuization happens.

    Three variants are compared: a model fitted on continuized data and
    applied to the raw table, the same applied to the continuized table,
    and a model that receives ``Continuize`` as its preprocessor.
    """
    raw = Orange.data.Table('zoo')
    continuized = Continuize(raw)

    # Fit on continuized data, project the raw table.
    from_raw = PCA()(continuized)(raw)
    # Fit on continuized data, project the continuized table.
    from_continuized = PCA()(continuized)(continuized)
    # Let the learner continuize by itself.
    with_preprocessor = PCA(preprocessors=[Continuize()])(raw)(raw)

    np.testing.assert_almost_equal(from_raw.X, from_continuized.X)
    np.testing.assert_almost_equal(from_raw.X, with_preprocessor.X)
def test_chain(self):
    """Three-component PCA gives equal projections for all chaining variants.

    The model is fitted on continuized data and applied to raw and to
    continuized data; a third model continuizes through its own
    preprocessor list.
    """
    raw = self.zoo
    continuized = Continuize()(raw)

    from_raw = PCA(n_components=3)(continuized)(raw)
    from_continuized = PCA(n_components=3)(continuized)(continuized)

    preprocessors = [Continuize()]
    with_preprocessor = PCA(
        n_components=3, preprocessors=preprocessors)(raw)(raw)

    np.testing.assert_almost_equal(from_raw.X, from_continuized.X)
    np.testing.assert_almost_equal(from_raw.X, with_preprocessor.X)
def __init__(self, preprocessors=None, penalty=1, opt_penalty=False,
             rule_learner=None, basic_attributes=True, fit_intercept=True,
             intercept_scaling=2, penalize_rules=True):
    """
    Store the learner's configuration.

    Parameters
    ----------
    preprocessors :
        A sequence of data preprocessors to apply on data prior to
        fitting the model.
    penalty :
        L2-penalty in loss function.
    opt_penalty :
        Stored on the instance; how it is used is not visible here.
    rule_learner :
        Rule learner used to construct new attributes.
    basic_attributes :
        Stored on the instance; how it is used is not visible here.
    fit_intercept :
        Should we add a constant column to data?
    intercept_scaling :
        Value of constant in the intercept column. Note that intercept
        column is appended after normalization, therefore higher values
        will be less affected by penalization.
    penalize_rules :
        Stored on the instance; how it is used is not visible here.
    """
    super().__init__(preprocessors)

    # Regularisation settings.
    self.penalty = penalty
    self.opt_penalty = opt_penalty
    self.penalize_rules = penalize_rules

    # Attribute construction settings.
    self.rule_learner = rule_learner
    self.basic_attributes = basic_attributes

    # Intercept settings.
    self.fit_intercept = fit_intercept
    self.intercept_scaling = intercept_scaling

    # Post rule learning preprocessing should not decrease the
    # number of examples.
    self.post_rule_preprocess = [Normalize(), Continuize()]
def __new__(cls, data, address='localhost:9465', batch=100, max_iter=100):
    """Fit an incremental PCA on samples of ``data`` and return the model.

    The data is continuized (multinomial attributes removed, no
    normalization). The model is first fitted on one sample (or on all
    data when ``batch`` covers it), then refined with up to ``max_iter``
    further samples. State is saved after every step and the loop stops
    early when ``aborted()`` reports true. ``address`` is accepted but
    not used in this body.
    """
    from orangecontrib.remote import aborted, save_state
    import Orange.data.sql.table

    def materialize(sample):
        # Download the sample and rebuild it as a plain table holding
        # only the attribute columns.
        sample.download_data(1000000)
        return Orange.data.Table.from_numpy(
            Orange.data.Domain(sample.domain.attributes), sample.X)

    continuizer = Continuize(multinomial_treatment=Continuize.Remove,
                             normalize_continuous=None)
    data = continuizer(data)
    learner = Orange.projection.IncrementalPCA()

    # Share of the data (in percent) that one batch represents.
    percent = batch / data.approx_len() * 100
    needs_sampling = percent < 100

    if needs_sampling:
        first = data.sample_percentage(percent, no_cache=True)
    else:
        first = data
    model = learner(materialize(first))
    save_state(model)

    # When a single batch already covers all data, no refinement is done.
    iterations = max_iter if needs_sampling else 0
    for i in range(iterations):
        chunk = materialize(data.sample_percentage(percent, no_cache=True))
        model.partial_fit(chunk)
        model.iteration = i
        save_state(model)
        if aborted():
            break
    return model
def test_normalize_data(self):
    """Widget clusters match DBSCAN with and without normalization."""
    widget = self.widget

    def output_clusters():
        # Cluster indices are read from the first meta column of the
        # annotated output; missing values are mapped to -1.
        annotated = self.get_output(widget.Outputs.annotated_data)
        labels = annotated.metas[:, 0].copy()
        labels[np.isnan(labels)] = -1
        return labels

    # not normalized
    widget.controls.normalize.setChecked(False)
    data = Table("heart_disease")
    self.send_signal(widget.Inputs.data, data)
    kwargs = dict(eps=widget.eps,
                  min_samples=widget.min_samples,
                  metric="euclidean")
    expected = DBSCAN(**kwargs)(data)
    np.testing.assert_array_equal(output_clusters(), expected)

    # normalized
    widget.controls.normalize.setChecked(True)
    # Parameters are read again after toggling the control.
    kwargs = dict(eps=widget.eps,
                  min_samples=widget.min_samples,
                  metric="euclidean")
    for preprocessor in (Continuize(), Normalize(), SklImpute()):
        data = preprocessor(data)
    expected = DBSCAN(**kwargs)(data)
    np.testing.assert_array_equal(output_clusters(), expected)
def test_transform_changed_domain(self):
    """
    1. Open data, apply some preprocessor, split the data into two parts,
    use LDA on the first part, and then transform the second part.

    2. Open data, split into two parts, apply the same preprocessor and
    LDA only on the first part, and then transform the second part.

    The transformed second part in (1) and (2) has to be the same.
    """
    shuffled = Randomize()(Table("iris"))
    preprocessor = Continuize()
    lda = LDA()
    train, test = slice(None, 75), slice(75, None)

    # (1) preprocess everything, then split
    all_preprocessed = preprocessor(shuffled)
    model = lda(all_preprocessed[train])
    result_1 = model(all_preprocessed[test])

    # (2) preprocess only the "training" part, transform raw test rows
    train_preprocessed = preprocessor(shuffled[train])
    model = lda(train_preprocessed)
    result_2 = model(shuffled[test])

    np.testing.assert_almost_equal(result_1.X, result_2.X)
def test_transform_changed_domain(self):
    """
    1. Open data, apply some preprocessor, split the data into two parts,
    use FreeViz on the first part, and then transform the second part.

    2. Open data, split into two parts, apply the same preprocessor and
    FreeViz only on the first part, and then transform the second part.

    The transformed second part in (1) and (2) has to be the same.
    """
    # Every tenth row of titanic is used.
    subset = Table("titanic")[::10]
    preprocessor = Continuize()
    freeviz = FreeViz(maxiter=40)
    train, test = slice(None, 100), slice(100, None)

    # (1) preprocess everything, then split
    all_preprocessed = preprocessor(subset)
    model = freeviz(all_preprocessed[train])
    result_1 = model(all_preprocessed[test])

    # (2) preprocess only the "training" part, transform raw test rows
    train_preprocessed = preprocessor(subset[train])
    model = freeviz(train_preprocessed)
    result_2 = model(subset[test])

    np.testing.assert_almost_equal(result_1.X, result_2.X)
def test_information_message(self):
    """Information is active for continuized titanic, cleared for self.data."""
    widget = self.widget
    widget.set_row_clustering(Clustering.OrderedClustering)

    continuized_titanic = Continuize()(self.titanic)
    self.send_signal(widget.Inputs.data, continuized_titanic)
    self.assertTrue(widget.Information.active)

    self.send_signal(widget.Inputs.data, self.data)
    self.assertFalse(widget.Information.active)
def test_preprocessor_chaining(self):
    """Chained preprocessors must be reproducible through a domain transform.

    A two-row table with one missing value is imputed and then
    continuized. Transforming the original table into the resulting
    domain has to yield the same ``X`` as the direct chain.
    """
    domain = Domain(
        [DiscreteVariable("a", values="01"),
         DiscreteVariable("b", values="01")],
        DiscreteVariable("y", values="01"))
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    table = Table.from_list(domain, [[0, 1], [1, np.nan]], [0, 1])
    pre1 = Continuize()(Impute()(table))
    pre2 = table.transform(pre1.domain)
    np.testing.assert_almost_equal(pre1.X, pre2.X)
def test_attr_label_metas(self, timeout=DEFAULT_TIMEOUT):
    """Set 'Label' from string meta attribute"""
    as_ordinal = Continuize(multinomial_treatment=Continuize.AsOrdinal)
    zoo = as_ordinal(Table("zoo"))
    self.send_signal(self.widget.Inputs.data, zoo)
    self.wait_until_finished(timeout=timeout)

    # Pick the variable at domain index -1 in the label combo box.
    label_name = zoo.domain[-1].name
    simulate.combobox_activate_item(
        self.widget.controls.attr_label, label_name)
def test_information_message(self):
    """Information is active for continuized titanic, cleared for self.data."""
    widget = self.widget
    widget.controls.row_clustering.setChecked(True)

    continuized_titanic = Continuize()(self.titanic)
    self.send_signal("Data", continuized_titanic)
    self.assertTrue(widget.Information.active)

    self.send_signal("Data", self.data)
    self.assertFalse(widget.Information.active)
def test_callback(self):
    """Progress values reported to the callback run from 0 to 1, sorted."""
    progress = unittest.mock.Mock()
    learner = DummySklLearner(preprocessors=[Continuize(), Randomize()])
    learner(Table("iris"), progress)

    # First positional argument of every recorded callback invocation.
    reported = [call[0][0] for call in progress.call_args_list]

    self.assertEqual(min(reported), 0)
    self.assertEqual(max(reported), 1)
    self.assertListEqual(reported, sorted(reported))
def test_information_message(self):
    """Information is active for continuized titanic, cleared for iris."""
    widget = self.widget
    widget.sort_rows = widget.OrderedClustering

    continuized_titanic = Continuize()(self.titanic)
    self.send_signal("Data", continuized_titanic)
    self.assertTrue(widget.Information.active)

    self.send_signal("Data", self.iris)
    self.assertFalse(widget.Information.active)
def test_discrete_expression(self):
    """With a Continuize preprocessor the expression may use indicator names."""
    heart = Table("heart_disease")
    attributes = heart.domain.attributes
    # Attributes 1..3 become features, attribute 4 becomes the class.
    reduced_domain = Domain(attributes[1:4], attributes[4])
    reduced = heart.transform(reduced_domain)

    self.send_signal(self.widget.Inputs.preprocessor, Continuize())
    self.__init_widget(reduced)

    self.assertEqual(self.widget.expression, "p1 + gender_female")
    model = self.get_output(self.widget.Outputs.model)
    self.assertIsNotNone(model)
def test_retain_all_data(self):
    """Clicking 'retain all data' outputs a table with the expected shapes."""
    widget = self.widget
    zoo = Table("zoo")
    template = Continuize()(zoo)

    self.send_signal(widget.Inputs.data, zoo)
    self.send_signal(widget.Inputs.template_data, template)
    widget.controls.retain_all_data.click()

    transformed = self.get_output(widget.Outputs.transformed_data)
    n_rows = len(zoo)
    self.assertIsInstance(transformed, Table)
    self.assertEqual(transformed.X.shape, (n_rows, 16))
    self.assertEqual(transformed.metas.shape, (n_rows, 38))
def test_attr_label_metas(self, timeout=DEFAULT_TIMEOUT):
    """Set 'Label' from string meta attribute"""
    widget = self.widget
    as_ordinal = Continuize(multinomial_treatment=Continuize.AsOrdinal)
    zoo = as_ordinal(Table("zoo"))
    self.send_signal(widget.Inputs.data, zoo)

    # If the widget is blocking, wait for its blocking state to change.
    if widget.isBlocking():
        state_changed = QSignalSpy(widget.blockingStateChanged)
        self.assertTrue(state_changed.wait(timeout))

    label_name = zoo.domain[-1].name
    simulate.combobox_activate_item(widget.controls.attr_label, label_name)
class EllipticEnvelopeLearner(SklLearner):
    """Learner wrapping ``skl_covariance.EllipticEnvelope``.

    By default data is continuized, columns flagged by
    ``RemoveNaNColumns`` are dropped and remaining missing values imputed.
    """
    __wraps__ = skl_covariance.EllipticEnvelope
    __returns__ = EllipticEnvelopeClassifier
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, store_precision=True, assume_centered=False,
                 support_fraction=None, contamination=0.1,
                 random_state=None, preprocessors=None):
        """Remember the constructor arguments as the learner's params."""
        super().__init__(preprocessors=preprocessors)
        # The whole local namespace is handed over; no extra locals may be
        # introduced before this line.
        self.params = locals()
class XGBBase(SklLearner):
    """Base class for xgboost (classification and regression) learners
    """
    # Both names refer to the same list object.
    default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
    ]
    preprocessors = default_preprocessors

    def __init__(self, preprocessors=None, **kwargs):
        """Keep all extra keyword arguments as the learner's params."""
        super().__init__(preprocessors=preprocessors)
        self.params = kwargs

    @SklLearner.params.setter
    def params(self, values: Dict):
        # Overrides the inherited setter: the mapping is stored as given.
        self._params = values
class TreeRegressionLearner(SklLearner):
    """Learner wrapping ``skl_tree.DecisionTreeRegressor``.

    Default preprocessing applies ``RemoveNaNColumns``, then imputes
    missing values, then continuizes.
    """
    __wraps__ = skl_tree.DecisionTreeRegressor
    __returns__ = TreeRegressor
    name = 'regression tree'
    preprocessors = [RemoveNaNColumns(), SklImpute(), Continuize()]

    def __init__(self, criterion="mse", splitter="best", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features=None,
                 random_state=None, max_leaf_nodes=None, preprocessors=None):
        """Remember the constructor arguments as the learner's params."""
        super().__init__(preprocessors=preprocessors)
        # The whole local namespace is handed over; no extra locals may be
        # introduced before this line.
        self.params = locals()
class PolynomialLearner(Learner):
    """Fit a wrapped learner on polynomial features of the data."""
    name = 'poly learner'
    preprocessors = [Continuize(), RemoveNaNColumns(), SklImpute()]

    def __init__(self, learner, degree=1, preprocessors=None):
        """Store the wrapped ``learner`` and the polynomial ``degree``."""
        super().__init__(preprocessors=preprocessors)
        self.degree = degree
        self.learner = learner

    def fit(self, X, Y, W):
        """Expand ``X`` to polynomial features and fit the wrapped learner.

        Weights are passed on as ``sample_weight`` only when given and
        when ``self.supports_weights`` is true. Returns a
        ``PolynomialModel`` holding the fitted model and the feature
        transformer.
        """
        expander = skl_preprocessing.PolynomialFeatures(self.degree)
        X = expander.fit_transform(X)
        wrapped = self.learner
        if W is not None and self.supports_weights:
            fitted = wrapped.fit(X, Y, sample_weight=W.reshape(-1))
        else:
            fitted = wrapped.fit(X, Y, None)
        return PolynomialModel(fitted, expander)
def __new__(cls, data, batch=100, max_iter=100):
    """Fit an incremental PCA on up to ``max_iter`` samples of ``data``.

    The data is continuized with multinomial attributes removed. Each
    iteration draws a sample, skips it when empty, feeds it to
    ``partial_fit`` and saves the model state.
    """
    continuizer = Continuize(multinomial_treatment=Continuize.Remove)
    data = continuizer(data)
    model = Orange.projection.IncrementalPCA()
    # Share of the data (in percent) that one batch represents.
    percent = batch / data.approx_len() * 100

    for i in range(max_iter):
        data_sample = data.sample_percentage(percent, no_cache=True)
        if not data_sample:
            continue
        data_sample.download_data(1000000)
        attributes_only = Orange.data.Domain(data_sample.domain.attributes)
        data_sample = Orange.data.Table.from_numpy(
            attributes_only, data_sample.X)
        model = model.partial_fit(data_sample)
        model.iteration = i
        save_state(model)
        # NOTE(review): data_sample was just rebound to the from_numpy
        # result, so the identity check against ``data`` looks like it can
        # never be true here -- confirm the intent.
        if aborted() or data_sample is data:
            break
    return model
class Clustering(metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional (default = [Continuize(), SklImpute()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    __wraps__ = None
    __returns__ = ClusteringModel
    preprocessors = [Continuize(), SklImpute()]

    def __init__(self, preprocessors, parameters):
        # Fall back to the class-level default when none are given; the
        # attribute is always set on the instance.
        if preprocessors is None:
            preprocessors = self.preprocessors
        self.preprocessors = preprocessors

        # Keep everything except bookkeeping names.
        excluded = ["self", "preprocessors", "__class__"]
        kept = {}
        for key, value in parameters.items():
            if key not in excluded:
                kept[key] = value
        self.params = kept

    def __call__(self, data):
        """Return the labels of the model fitted on ``data``."""
        model = self.get_model(data)
        return model.labels

    def get_model(self, data):
        """Preprocess ``data``, fit a model and attach both domains to it."""
        original = data.domain
        preprocessed = self.preprocess(data)
        model = self.fit_storage(preprocessed)
        model.domain = preprocessed.domain
        model.original_domain = original
        return model

    def fit_storage(self, data):
        # only data Table
        return self.fit(data.X)

    def fit(self, X: np.ndarray, y: np.ndarray = None):
        """Fit the wrapped estimator on ``X``; ``y`` is not used."""
        estimator = self.__wraps__(**self.params)
        return self.__returns__(estimator.fit(X))

    def preprocess(self, data):
        """Apply the preprocessors to ``data`` in order."""
        result = data
        for preprocessor in self.preprocessors:
            result = preprocessor(result)
        return result
def test_discrete_features(self):
    """Discrete data raises a data error unless a Continuize preprocessor is set."""
    widget = self.widget
    feature_model = widget.controls._feature.model()
    data_error = widget.Error.data_error

    # Discretized data alone: one row in the model, error shown.
    discretized = Discretize()(self.housing)
    self.send_signal(widget.Inputs.data, discretized)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertTrue(data_error.is_shown())

    # Adding a continuizer: more rows, no error.
    self.send_signal(widget.Inputs.preprocessor, Continuize())
    self.assertGreater(feature_model.rowCount(), 1)
    self.assertFalse(data_error.is_shown())

    # Removing the preprocessor: back to one row and the error.
    self.send_signal(widget.Inputs.preprocessor, None)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertTrue(data_error.is_shown())

    # Removing the data: one row, error cleared.
    self.send_signal(widget.Inputs.data, None)
    self.assertEqual(feature_model.rowCount(), 1)
    self.assertFalse(data_error.is_shown())
def test_reconstruct_domain(self):
    """The reconstructed domain keeps the shape and yields no NaN in row 0.

    Checked for the raw, the scaled and the continuized heart disease
    data, in that order.
    """
    def check(table):
        # Fit, reconstruct the domain, transform the model's original data.
        model = LogisticRegressionLearner()(table)
        domain = OWNomogram.reconstruct_domain(model, model.domain)
        transformed = model.original_data.transform(domain)
        self.assertEqual(transformed.X.shape, table.X.shape)
        self.assertFalse(np.isnan(transformed.X[0]).any())

    heart = Table("heart_disease")
    check(heart)
    check(Scale()(heart))
    check(Continuize()(heart))
def apply(self):
    """Run a two-component LDA on the data and send both outputs.

    Sends ``None`` on both outputs when there is no data. Otherwise the
    stored data is replaced by its imputed and continuized version, the
    projection is sent as "Transformed data" and the scalings as
    "Components".
    """
    if self.data is None:
        self.send("Transformed data", None)
        self.send("Components", None)
        return

    self.data = Continuize(Impute(self.data))
    data = self.data
    lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
    projection = lda.fit_transform(data.X, data.Y)

    # Projected data keeps the class variables and metas of the input.
    projected_domain = Domain(
        [ContinuousVariable('Component_1'),
         ContinuousVariable('Component_2')],
        data.domain.class_vars, data.domain.metas)
    transformed = Table(projected_domain, projection, data.Y, data.metas)
    transformed.name = data.name + ' (LDA)'

    # One row per component, labelled through a string meta column.
    components_domain = Domain(
        data.domain.attributes,
        metas=[StringVariable(name='component')])
    n_components = lda.scalings_.shape[1]
    labels = ['Component_{}'.format(i + 1) for i in range(n_components)]
    metas = np.array([labels], dtype=object).T
    components = Table(components_domain, lda.scalings_.T, metas=metas)
    components.name = 'components'

    self.send("Transformed data", transformed)
    self.send("Components", components)
def createinstance(params):
    """Build a ``Continuize`` preprocessor from a parameter mapping.

    ``multinomial_treatment`` is read from ``params`` and defaults to
    ``Continuize.Indicators``; the passed mapping is not modified.
    """
    settings = dict(params)
    treatment = settings.get("multinomial_treatment", Continuize.Indicators)
    return Continuize(multinomial_treatment=treatment)
from sklearn.metrics import pairwise_distances from Orange.preprocess import Normalize, Continuize, SklImpute from Orange.widgets import widget, gui from Orange.widgets.utils.slidergraph import SliderGraph from Orange.widgets.settings import Setting from Orange.data import Table, DiscreteVariable from Orange.data.util import get_unique_names from Orange.clustering import DBSCAN from Orange.widgets.utils.annotated_data import ANNOTATED_DATA_SIGNAL_NAME from Orange.widgets.utils.signals import Input, Output from Orange.widgets.widget import Msg DEFAULT_CUT_POINT = 0.1 PREPROCESSORS = [Continuize(), Normalize(), SklImpute()] EPS_BOTTOM_LIMIT = 0.01 def get_kth_distances(data, metric, k=5): """ The function computes the epsilon parameter for DBSCAN through method proposed in the paper. Parameters ---------- data : Orange.data.Table Visualisation coordinates - embeddings metric : callable or str The metric to compute the distance. k : int Number kth observed neighbour
class SoftmaxLearner(Learner):
    """
    Implementation of softmax regression with k*(n+1) parameters
    trained using L-BFGS optimization.
    """
    name = 'softmax'
    preprocessors = [
        RemoveNaNClasses(), Normalize(), Continuize(), Impute(),
        RemoveNaNColumns()
    ]

    def __init__(self, preprocessors=None):
        super().__init__(preprocessors=preprocessors)

    def mysigma(self, x):
        """
        Row-wise softmax of a 2-D array.

        The row maximum is subtracted before exponentiation to prevent
        overflow; this may cause underflow, which is harmless here.
        """
        tmpx = np.exp(x - np.max(x, axis=1)[:, None])
        return tmpx / np.sum(tmpx, axis=1)[:, None]

    def cost(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            float: The value of cost function evaluated with given
                parameters.
        """
        # Turn theta from a long vector into matrix form, then prepare
        # the indicator (one-hot) matrix of the targets.
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]

        # Log-softmax is computed directly instead of np.log(softmax):
        # a probability that underflows to 0 would give log(0) = -inf and
        # 0 * -inf = nan in the sum below. After shifting by the row
        # maximum the sum of exponentials is at least 1, so its logarithm
        # is always finite.
        scores = X.dot(theta.T)
        scores = scores - np.max(scores, axis=1)[:, None]
        log_prob = scores - np.log(np.sum(np.exp(scores), axis=1))[:, None]
        return -(np.sum(indicator * log_prob))

    def grad(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(X.T.dot(
            (indicator - self.mysigma(X.dot(theta.T))))).T.flatten()

    def approx_grad(self, theta, X, y, eps=1e-5):
        """
        Numerically estimate the gradient with central differences.

        Args:
            theta (np.ndarray): model parameters of shape
                [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            eps (float): value offset for gradient estimation

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        result = []
        for i in range(len(theta)):
            # Unit vector selecting the i-th parameter.
            crr = np.zeros(len(theta))
            crr[i] = 1
            result.append((self.cost(theta + (crr * eps), X, y) -
                           self.cost(theta - (crr * eps), X, y)) /
                          (2 * eps))
        return np.array(result)

    def fit(self, X, y, W=None):
        """
        Args:
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            W (np.ndarray): Orange weights - ignored

        Returns:
            SoftmaxModel: Orange's classification model
        """
        # We assume that all classes are present in y.
        num_classes = len(np.unique(y))
        # Prepend a column of ones for the intercept.
        X = np.column_stack((np.ones(X.shape[0]), X))
        theta = np.ones(num_classes * X.shape[1]) * 1e-9
        result = fmin_l_bfgs_b(self.cost, theta, self.grad, args=(X, y))[0]
        return SoftmaxModel(result.reshape((-1, X.shape[1])))
class CatBoostLearnerRegression(CatBoostLearner, LearnerRegression):
    """Regression learner whose params are filtered by the wrapped class."""
    __wraps__ = None
    __returns__ = CatBoostModel
    supports_multiclass = True
    _params = {}

    learner_adequacy_err_msg = "Continuous class variable expected."

    # Both names refer to the same list object.
    default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]
    preprocessors = default_preprocessors

    def check_learner_adequacy(self, domain):
        """Return whether ``domain`` has a continuous class."""
        return domain.has_continuous_class

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        """Keep the entries of ``values`` named in the wrapped ``__init__``.

        Raises ``TypeError`` when ``__wraps__`` is not defined.
        """
        wrapped = self.__wraps__
        if wrapped is None:
            raise TypeError("Wrapper does not define '__wraps__'")
        spec = inspect.getargs(wrapped.__init__.__code__)
        # first argument is 'self'
        assert spec.args[0] == "self"
        accepted = spec.args[1:]
        return {name: values[name] for name in accepted if name in values}

    def preprocess(self, data):
        """Preprocess ``data``; reject discrete attributes with over 2 values."""
        data = super().preprocess(data)
        has_multinomial = any(
            var.is_discrete and len(var.values) > 2
            for var in data.domain.attributes)
        if has_multinomial:
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        """Fit a model on ``data`` and copy the learner's params onto it."""
        model = super().__call__(data)
        model.params = self.params
        return model

    def fit(self, X, Y, W=None):
        """Fit the wrapped estimator, with weights when given and supported."""
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is not None and self.supports_weights:
            return self.__returns__(
                clf.fit(X, Y, sample_weight=W.reshape(-1)))
        return self.__returns__(clf.fit(X, Y))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        fit_varnames = self.__wraps__.fit.__code__.co_varnames
        return 'sample_weight' in fit_varnames

    def __getattr__(self, item):
        # Unknown attributes are looked up among the params.
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        names = set(super().__dir__())
        names.update(self.params.keys())
        return sorted(names)
class SklLearner(Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
        An ordered list of preprocessors applied to data before
        training or testing.
        Defaults to
        `[HasClass(), Continuize(), RemoveNaNColumns(), SklImpute()]`
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    # Both names refer to the same list object.
    default_preprocessors = [
        HasClass(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()
    ]
    preprocessors = default_preprocessors

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        """Keep the entries of ``values`` named in the wrapped ``__init__``.

        Raises ``TypeError`` when ``__wraps__`` is not defined.
        """
        wrapped = self.__wraps__
        if wrapped is None:
            raise TypeError("Wrapper does not define '__wraps__'")
        spec = inspect.getargs(wrapped.__init__.__code__)
        # first argument is 'self'
        assert spec.args[0] == "self"
        accepted = spec.args[1:]
        return {name: values[name] for name in accepted if name in values}

    def preprocess(self, data):
        """Preprocess ``data``; reject discrete attributes with over 2 values."""
        data = super().preprocess(data)
        has_multinomial = any(
            var.is_discrete and len(var.values) > 2
            for var in data.domain.attributes)
        if has_multinomial:
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        """Fit a model on ``data`` and copy the learner's params onto it."""
        model = super().__call__(data)
        model.params = self.params
        return model

    def fit(self, X, Y, W=None):
        """Fit the wrapped estimator, with weights when given and supported."""
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is not None and self.supports_weights:
            return self.__returns__(
                clf.fit(X, Y, sample_weight=W.reshape(-1)))
        return self.__returns__(clf.fit(X, Y))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        fit_varnames = self.__wraps__.fit.__code__.co_varnames
        return 'sample_weight' in fit_varnames

    def __getattr__(self, item):
        # Unknown attributes are looked up among the params.
        try:
            return self.params[item]
        except (KeyError, AttributeError):
            raise AttributeError(item) from None

    # TODO: Disallow (or mirror) __setattr__ for keys in params?

    def __dir__(self):
        names = set(super().__dir__())
        names.update(self.params.keys())
        return sorted(names)