def set_test_data(self, data):
    """
    Set the input separate testing dataset.
    """
    self.error(1)
    self.information(1)
    if data and not data.domain.class_var:
        self.error(1, "Test data input requires a class variable")
        data = None

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.information(1, "Test data has been sampled")
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.warning(4)
    self.test_data_missing_vals = data is not None and \
        np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.warning(4, self._get_missing_data_warning(
            self.train_data_missing_vals, self.test_data_missing_vals))
        if data:
            data = RemoveNaNClasses(data)

    self.test_data = data
    if self.resampling == OWTestLearners.TestOnTest:
        self._invalidate()
def set_train_data(self, data):
    """
    Set the input training dataset.

    Parameters
    ----------
    data : Optional[Orange.data.Table]
    """
    self.Information.data_sampled.clear()
    self.Error.train_data_empty.clear()
    self.Error.class_required.clear()
    self.Error.too_many_classes.clear()
    self.Error.only_one_class_var_value.clear()
    if data is not None and not len(data):
        self.Error.train_data_empty()
        data = None
    if data:
        conds = [not data.domain.class_vars,
                 len(data.domain.class_vars) > 1,
                 data.domain.has_discrete_class
                 and len(data.domain.class_var.values) == 1]
        errors = [self.Error.class_required,
                  self.Error.too_many_classes,
                  self.Error.only_one_class_var_value]
        for cond, error in zip(conds, errors):
            if cond:
                error()
                data = None
                break

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.train_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = RemoveNaNClasses(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    self._update_scorers()
    self._update_controls()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain)
        if self.fold_feature_selected and bool(self.feature_model):
            self.resampling = OWTestLearners.FeatureFold
    self._invalidate()
def set_train_data(self, data):
    """
    Set the input training dataset.
    """
    self.error(0)
    self.information(0)
    if data and not data.domain.class_var:
        self.error(0, "Train data input requires a class variable")
        data = None

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.information(0, "Train data has been sampled")
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.warning(4)
    self.train_data_missing_vals = data is not None and \
        np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.warning(4, self._get_missing_data_warning(
            self.train_data_missing_vals, self.test_data_missing_vals))
        if data:
            data = RemoveNaNClasses(data)

    self.data = data
    self.closeContext()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain.class_var)
    self._invalidate()
def set_test_data(self, data):
    """
    Set the input separate testing dataset.
    """
    self.Information.test_data_sampled.clear()
    if data and not data.domain.class_var:
        self.Error.class_required()
        data = None
    else:
        self.Error.class_required_test.clear()

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.test_data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.test_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = RemoveNaNClasses(data)
    else:
        self.Warning.missing_data.clear()

    self.test_data = data
    if self.resampling == OWTestLearners.TestOnTest:
        self._invalidate()
def set_train_data(self, data):
    """
    Set the input training dataset.
    """
    self.Information.data_sampled.clear()
    if data and not data.domain.class_var:
        self.Error.class_required()
        data = None
    else:
        self.Error.class_required.clear()

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.train_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = RemoveNaNClasses(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain.class_var)
    self._invalidate()
def test_remove_nan_classes_multiclass(self):
    domain = Domain([DiscreteVariable("a", values="01")],
                    [DiscreteVariable("b", values="01"),
                     DiscreteVariable("c", values="01")])
    table = Table(domain, [[0, 1, np.nan],
                           [1, np.nan, 0],
                           [1, 0, 1],
                           [1, np.nan, np.nan]])
    table = RemoveNaNClasses(table)
    self.assertTrue(not np.isnan(table).any())
    self.assertEqual(table.domain, domain)
    self.assertEqual(len(table), 1)
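# A minimal numpy sketch (illustrative only, not the library's actual
# implementation) of the row selection the test above expects from
# RemoveNaNClasses: keep only instances whose class columns contain no NaN.
import numpy as np

def keep_rows_with_known_classes(X, Y):
    """Return X and Y restricted to rows where every class value is known."""
    Y = Y.reshape(len(Y), -1)          # ensure shape (n_examples, n_classes)
    mask = ~np.isnan(Y).any(axis=1)    # True where all class values are defined
    return X[mask], Y[mask]

# For the multiclass test data above, only the row with classes (0, 1) survives.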
def set_train_data(self, data):
    """
    Set the input training dataset.
    """
    self.Information.data_sampled.clear()
    self.Error.train_data_empty.clear()
    if data is not None and not len(data):
        self.Error.train_data_empty()
        data = None
    if data and not data.domain.class_vars:
        self.Error.class_required()
        data = None
    elif data and len(data.domain.class_vars) > 1:
        self.Error.too_many_classes()
        data = None
    else:
        self.Error.class_required.clear()
        self.Error.too_many_classes.clear()

    if isinstance(data, SqlTable):
        if data.approx_len() < AUTO_DL_LIMIT:
            data = Table(data)
        else:
            self.Information.data_sampled()
            data_sample = data.sample_time(1, no_cache=True)
            data_sample.download_data(AUTO_DL_LIMIT, partial=True)
            data = Table(data_sample)

    self.train_data_missing_vals = \
        data is not None and np.isnan(data.Y).any()
    if self.train_data_missing_vals or self.test_data_missing_vals:
        self.Warning.missing_data(self._which_missing_data())
        if data:
            data = RemoveNaNClasses(data)
    else:
        self.Warning.missing_data.clear()

    self.data = data
    self.closeContext()
    self._update_controls()
    if data is not None:
        self._update_class_selection()
        self.openContext(data.domain)
        if self.fold_feature_selected and bool(self.feature_model):
            self.resampling = OWTestLearners.FeatureFold
    self._invalidate()
def commit(self):
    alpha = self.alphas[self.alpha_index]
    preprocessors = self.preprocessors
    if self.data is not None and np.isnan(self.data.Y).any():
        self.warning(0, "Missing values of target variable(s)")
    if not self.preprocessors:
        if self.reg_type == OWLinearRegression.OLS:
            preprocessors = LinearRegressionLearner.preprocessors
        elif self.reg_type == OWLinearRegression.Ridge:
            preprocessors = RidgeRegressionLearner.preprocessors
        else:
            preprocessors = LassoRegressionLearner.preprocessors
    else:
        preprocessors = list(self.preprocessors)
        preprocessors.append(RemoveNaNClasses())
    args = {"preprocessors": preprocessors}
    if self.reg_type == OWLinearRegression.OLS:
        learner = LinearRegressionLearner(**args)
    elif self.reg_type == OWLinearRegression.Ridge:
        learner = RidgeRegressionLearner(alpha=alpha, **args)
    elif self.reg_type == OWLinearRegression.Lasso:
        learner = LassoRegressionLearner(alpha=alpha, **args)
    learner.name = self.learner_name
    predictor = None
    coef_table = None

    self.error(0)
    if self.data is not None:
        if not learner.check_learner_adequacy(self.data.domain):
            self.error(0, learner.learner_adequacy_err_msg)
        else:
            predictor = learner(self.data)
            predictor.name = self.learner_name
            domain = Domain(
                [ContinuousVariable("coef", number_of_decimals=7)],
                metas=[StringVariable("name")])
            coefs = [predictor.intercept] + list(predictor.coefficients)
            names = ["intercept"] + \
                    [attr.name for attr in predictor.domain.attributes]
            coef_table = Table(domain, list(zip(coefs, names)))
            coef_table.name = "coefficients"

    self.send("Linear Regression", learner)
    self.send("Model", predictor)
    self.send("Coefficients", coef_table)
class TreeLearner(SklLearner):
    __wraps__ = skl_tree.DecisionTreeClassifier
    __returns__ = TreeClassifier
    name = 'tree'
    preprocessors = [RemoveNaNClasses(),
                     RemoveNaNColumns(),
                     SklImpute(),
                     Continuize()]

    def __init__(self, criterion="gini", splitter="best", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features=None,
                 random_state=None, max_leaf_nodes=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
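# A minimal usage sketch for the learner above: because RemoveNaNClasses() is
# among the default preprocessors, instances with an unknown class value are
# dropped before the wrapped scikit-learn tree is fitted. The toy domain and
# data below are illustrative only.
import numpy as np
from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table

domain = Domain([ContinuousVariable("x1"), ContinuousVariable("x2")],
                DiscreteVariable("y", values=["0", "1"]))
data = Table(domain, [[0.1, 0.2, 0],
                      [0.9, 0.8, 1],
                      [0.5, 0.4, np.nan]])   # unknown class value
learner = TreeLearner()
print(len(learner.preprocess(data)))         # 2: the unlabelled row was dropped
model = learner(data)                        # fits on the two labelled rows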
class SoftmaxLearner(Learner):
    """
    Implementation of softmax regression with k*(n+1) parameters trained
    using L-BFGS optimization.
    """
    name = 'softmax'
    preprocessors = [RemoveNaNClasses(),
                     Normalize(),
                     Continuize(),
                     Impute(),
                     RemoveNaNColumns()]

    def __init__(self, preprocessors=None):
        super().__init__(preprocessors=preprocessors)

    def mysigma(self, x):
        """
        Softmax function. Make sure the data is correctly oriented
        (handled here by slicing along axis 1). The row maximum is
        subtracted to prevent overflow when computing the exponent;
        this may cause underflow, but that is not a problem.
        """
        tmpx = np.exp(x - np.max(x, axis=1)[:, None])
        return tmpx / np.sum(tmpx, axis=1)[:, None]

    def cost(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            float: The value of cost function evaluated with given parameters.
        """
        # Reshape theta from a flat vector into matrix form, then build the
        # indicator (one-hot) matrix for the target classes.
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(np.sum(indicator * np.log(self.mysigma(X.dot(theta.T)))))

    def grad(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(X.T.dot(
            (indicator - self.mysigma(X.dot(theta.T))))).T.flatten()

    def approx_grad(self, theta, X, y, eps=1e-5):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            eps (float): value offset for gradient estimation

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        result = []
        for i in range(len(theta)):
            crr = np.zeros(len(theta))
            crr[i] = 1
            result.append((self.cost(theta + (crr * eps), X, y) -
                           self.cost(theta - (crr * eps), X, y)) / (2 * eps))
        return np.array(result)

    def fit(self, X, y, W=None):
        """
        Args:
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            W (np.ndarray): Orange weights - ignore for this exercise

        Returns:
            SoftmaxModel: Orange's classification model
        """
        # We assume all classes are present in y.
        num_classes = len(np.unique(y))
        X = np.column_stack((np.ones(X.shape[0]), X))
        theta = np.ones(num_classes * X.shape[1]) * 1e-9
        result = fmin_l_bfgs_b(self.cost, theta, self.grad, args=(X, y))[0]
        return SoftmaxModel(result.reshape((-1, X.shape[1])))
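# A short gradient-check sketch for the exercise class above: the analytic
# gradient from grad() should match the central-difference estimate from
# approx_grad() on random data. Shapes and the random seed are illustrative only.
import numpy as np

learner = SoftmaxLearner()
rng = np.random.RandomState(0)
X = np.column_stack((np.ones(20), rng.randn(20, 3)))   # 20 examples, bias + 3 features
y = rng.randint(0, 3, size=20)                          # 3 classes
theta = 0.1 * rng.randn(3 * X.shape[1])
assert np.allclose(learner.grad(theta, X, y),
                   learner.approx_grad(theta, X, y), atol=1e-4)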
def test_remove_nan_classes(self):
    table = Table("imports-85")
    self.assertTrue(np.isnan(table.Y).any())
    table = RemoveNaNClasses(table)
    self.assertTrue(not np.isnan(table.Y).any())
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
            (default=[RemoveNaNClasses(), Normalize(), Continuize(), Impute(), RemoveNaNColumns()])
        Preprocessors are applied to data before training or testing.
        Default preprocessors

        - remove rows with unknown target values,
        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - remove columns with all values as NaN,
        - replace NaN values with suitable values.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.

    Examples
    --------

        import numpy as np
        from Orange.data import Table
        from Orange.regression.linear_bfgs import LinearRegressionLearner

        data = Table('housing')
        data.X = np.hstack((data.X, np.ones((data.X.shape[0], 1))))  # append ones
        m = LinearRegressionLearner(lambda_=1.0)
        c = m(data)       # fit
        print(c(data))    # predict
    '''
    name = 'linear_bfgs'
    preprocessors = [RemoveNaNClasses(),
                     Normalize(),
                     Continuize(),
                     Impute(),
                     RemoveNaNColumns()]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError('Linear regression does not support '
                             'multi-target classification')

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError('Linear regression does not support '
                             'unknown values')

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(self.cost_grad, theta,
                                         args=(X, Y.ravel()),
                                         **self.fmin_args)

        return LinearRegressionModel(theta)
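# A brief sanity-check sketch for the learner above: with the cost
# (||X theta - y||^2 + lambda ||theta||^2) / (2 n), the minimizer is the
# closed-form ridge solution (X^T X + lambda I)^{-1} X^T y, so cost_grad()
# should return a (near-)zero gradient there. The toy data is illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X = np.column_stack((np.ones(50), rng.randn(50, 3)))     # bias column + 3 features
y = X.dot(np.array([1.0, 2.0, -1.0, 0.5])) + 0.01 * rng.randn(50)

lambda_ = 1.0
closed_form = np.linalg.solve(X.T.dot(X) + lambda_ * np.eye(X.shape[1]),
                              X.T.dot(y))
_, grad = LinearRegressionLearner(lambda_=lambda_).cost_grad(closed_form, X, y)
assert np.allclose(grad, 0, atol=1e-8)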
class SklLearner(_ReprableWithParams, Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional
            (default=[RemoveNaNClasses(), Continuize(), SklImpute(), RemoveNaNColumns()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    preprocessors = default_preprocessors = [
        RemoveNaNClasses(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {name: values[name] for name in spec.args[1:]
                      if name in values}
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)

        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")
        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances."""
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.
    Uses the L-BFGS algorithm to minimize the categorical
    cross entropy cost with L2 regularization. This model is suitable
    when dealing with a multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each
      value of the class variable) instead of softmax regression.

    Parameters
    ----------
    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional
            (default=[RemoveNaNClasses(), RemoveNaNColumns(), Impute(), Continuize(), Normalize()])
        Preprocessors are applied to data before training or testing.
        Default preprocessors:

        - remove columns with all values as NaN,
        - replace NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """
    name = 'softmax'
    preprocessors = [RemoveNaNClasses(),
                     RemoveNaNColumns(),
                     Impute(),
                     Continuize(),
                     Normalize()]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))
        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError('Softmax regression does not support '
                             'multi-label classification')

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError('Softmax regression does not support '
                             'unknown values')

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta,
                                      args=(X, Y), **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
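# A minimal usage sketch for the learner above: calling it on a Table runs the
# default preprocessors (RemoveNaNClasses() first, so rows with an unknown class
# are dropped) before fit() builds the one-hot target matrix. The dataset name
# is illustrative.
from Orange.data import Table

data = Table("iris")
learner = SoftmaxRegressionLearner(lambda_=1.0)
model = learner(data)            # preprocess + L-BFGS fit
print(model(data[:5]))           # predicted class indices for the first rows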