Example 1
    def set_test_data(self, data):
        """
        Set the input separate testing dataset.
        """
        self.error(1)
        self.information(1)
        if data and not data.domain.class_var:
            self.error(1, "Test data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(1, "Test data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.test_data_missing_vals = data is not None and \
                                      np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(
                4,
                self._get_missing_data_warning(self.train_data_missing_vals,
                                               self.test_data_missing_vals))
            if data:
                data = RemoveNaNClasses(data)

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()
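Example 1 above (and Example 3 below) uses Orange's legacy slot-based widget messages, where self.error(i, text) sets message slot i and self.error(i) with no text clears it; the remaining widget examples use the newer declarative message groups. A minimal sketch of the declarative style, assuming Orange's widget framework (OWWidget and Msg from Orange.widgets.widget):

    from Orange.widgets.widget import OWWidget, Msg

    class MyWidget(OWWidget):
        name = "Sketch"

        class Error(OWWidget.Error):
            class_required = Msg("Test data input requires a class variable")

        def set_test_data(self, data):
            self.Error.class_required.clear()        # reset the message
            if data is not None and not data.domain.class_var:
                self.Error.class_required()          # activate the message
                data = None
            self.test_data = data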
Example 2
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        self.Information.data_sampled.clear()
        self.Error.train_data_empty.clear()
        self.Error.class_required.clear()
        self.Error.too_many_classes.clear()
        self.Error.only_one_class_var_value.clear()
        if data is not None and not len(data):
            self.Error.train_data_empty()
            data = None
        if data:
            conds = [
                not data.domain.class_vars,
                len(data.domain.class_vars) > 1,
                data.domain.has_discrete_class
                and len(data.domain.class_var.values) == 1
            ]
            errors = [
                self.Error.class_required, self.Error.too_many_classes,
                self.Error.only_one_class_var_value
            ]
            for cond, error in zip(conds, errors):
                if cond:
                    error()
                    data = None
                    break

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_scorers()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestLearners.FeatureFold
        self._invalidate()
Example 3
    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.error(0)
        self.information(0)
        if data and not data.domain.class_var:
            self.error(0, "Train data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(0, "Train data has been sampled")
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.warning(4)
        self.train_data_missing_vals = data is not None and \
                                       np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.warning(4, self._get_missing_data_warning(
                self.train_data_missing_vals, self.test_data_missing_vals
            ))
            if data:
                data = RemoveNaNClasses(data)

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()
Example 4
    def set_test_data(self, data):
        """
        Set the input separate testing dataset.
        """
        self.Information.test_data_sampled.clear()
        if data and not data.domain.class_var:
            self.Error.class_required_test()
            data = None
        else:
            self.Error.class_required_test.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.test_data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.test_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()
Example 5
    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.Information.data_sampled.clear()
        if data and not data.domain.class_var:
            self.Error.class_required()
            data = None
        else:
            self.Error.class_required.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()
Example 6
    def test_remove_nan_classes_multiclass(self):
        domain = Domain([DiscreteVariable("a", values="01")],
                        [DiscreteVariable("b", values="01"),
                         DiscreteVariable("c", values="01")])
        table = Table(domain, [[0, 1, np.nan],
                               [1, np.nan, 0],
                               [1, 0, 1],
                               [1, np.nan, np.nan]])
        table = RemoveNaNClasses(table)
        self.assertTrue(not np.isnan(table).any())
        self.assertEqual(table.domain, domain)
        self.assertEqual(len(table), 1)
Example 7
    def set_train_data(self, data):
        """
        Set the input training dataset.
        """
        self.Information.data_sampled.clear()
        self.Error.train_data_empty.clear()
        if data is not None and not len(data):
            self.Error.train_data_empty()
            data = None
        if data and not data.domain.class_vars:
            self.Error.class_required()
            data = None
        elif data and len(data.domain.class_vars) > 1:
            self.Error.too_many_classes()
            data = None
        else:
            self.Error.class_required.clear()
            self.Error.too_many_classes.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.train_data_missing_vals = \
            data is not None and np.isnan(data.Y).any()
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestLearners.FeatureFold
        self._invalidate()
Example 8
    def commit(self):
        alpha = self.alphas[self.alpha_index]
        preprocessors = self.preprocessors
        if self.data is not None and np.isnan(self.data.Y).any():
            self.warning(0, "Missing values of target variable(s)")
            if not self.preprocessors:
                if self.reg_type == OWLinearRegression.OLS:
                    preprocessors = LinearRegressionLearner.preprocessors
                elif self.reg_type == OWLinearRegression.Ridge:
                    preprocessors = RidgeRegressionLearner.preprocessors
                else:
                    preprocessors = LassoRegressionLearner.preprocessors
            else:
                preprocessors = list(self.preprocessors)
            preprocessors.append(RemoveNaNClasses())
        args = {"preprocessors": preprocessors}
        if self.reg_type == OWLinearRegression.OLS:
            learner = LinearRegressionLearner(**args)
        elif self.reg_type == OWLinearRegression.Ridge:
            learner = RidgeRegressionLearner(alpha=alpha, **args)
        elif self.reg_type == OWLinearRegression.Lasso:
            learner = LassoRegressionLearner(alpha=alpha, **args)

        learner.name = self.learner_name
        predictor = None
        coef_table = None

        self.error(0)
        if self.data is not None:
            if not learner.check_learner_adequacy(self.data.domain):
                self.error(0, learner.learner_adequacy_err_msg)
            else:
                predictor = learner(self.data)
                predictor.name = self.learner_name
                domain = Domain(
                    [ContinuousVariable("coef", number_of_decimals=7)],
                    metas=[StringVariable("name")])
                coefs = [predictor.intercept] + list(predictor.coefficients)
                names = ["intercept"] + \
                    [attr.name for attr in predictor.domain.attributes]
                coef_table = Table(domain, list(zip(coefs, names)))
                coef_table.name = "coefficients"

        self.send("Linear Regression", learner)
        self.send("Model", predictor)
        self.send("Coefficients", coef_table)
Example 9
class TreeLearner(SklLearner):
    __wraps__ = skl_tree.DecisionTreeClassifier
    __returns__ = TreeClassifier
    name = 'tree'
    preprocessors = [
        RemoveNaNClasses(),
        RemoveNaNColumns(),
        SklImpute(),
        Continuize()
    ]

    def __init__(self,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.params = vars()
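The self.params = vars() line works because vars() with no argument returns the local namespace of __init__, capturing every constructor argument at once; the params setter inherited from SklLearner (see Example 13) then filters that dict against the wrapped scikit-learn signature. A small self-contained illustration of the idiom, with hypothetical names:

    class Config:
        def __init__(self, depth=3, rate=0.1):
            self.params = vars()   # {'self': <Config>, 'depth': 3, 'rate': 0.1}

    c = Config(depth=5)
    print(sorted(c.params))        # ['depth', 'rate', 'self']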
Example 10
class SoftmaxLearner(Learner):
    """
    Implementation of softmax regression with k*(n+1) parameters
    trained using L-BFGS optimization.
    """
    name = 'softmax'
    preprocessors = [
        RemoveNaNClasses(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns()
    ]

    def __init__(self, preprocessors=None):
        super().__init__(preprocessors=preprocessors)

    def mysigma(self, x):
        """
        Softmax function. Expects one example per row. The row-wise
        maximum is subtracted before exponentiation to prevent overflow;
        this may cause underflow instead, which is harmless here.
        """
        tmpx = np.exp(x - np.max(x, axis=1)[:, None])
        return tmpx / np.sum(tmpx, axis=1)[:, None]

    def cost(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            float: The value of cost function evaluated with given parameters.
        """
        # Reshape theta from a flat vector into matrix form, then build
        # the indicator (one-hot) matrix for the targets.
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(np.sum(indicator * np.log(self.mysigma(X.dot(theta.T)))))

    def grad(self, theta, X, y):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        theta = theta.reshape((-1, X.shape[1]))
        indicator = np.identity(theta.shape[0])[y.astype(int)]
        return -(X.T.dot(
            (indicator - self.mysigma(X.dot(theta.T))))).T.flatten()

    def approx_grad(self, theta, X, y, eps=1e-5):
        """
        Args:
            theta (np.ndarray): model parameters of shape [n_classes * n_features]
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            eps (float): value offset for gradient estimation

        Returns:
            np.ndarray: Gradients wrt. all model's parameters of shape
                [n_classes * n_features]
        """
        result = []
        for i in range(len(theta)):
            crr = np.zeros(len(theta))
            crr[i] = 1
            result.append((self.cost(theta + (crr * eps), X, y) -
                           self.cost(theta - (crr * eps), X, y)) / (2 * eps))

        return np.array(result)

    def fit(self, X, y, W=None):
        """
        Args:
            X (np.ndarray): data of shape [n_examples, n_features]
            y (np.ndarray): target variable of shape [n_examples]
            W (np.ndarray): Orange weights - ignore for this exercise

        Returns:
            SoftmaxModel: Orange's classification model
        """
        num_classes = len(np.unique(y))  # assume all classes appear in y
        X = np.column_stack((np.ones(X.shape[0]), X))
        theta = np.ones(num_classes * X.shape[1]) * 1e-9
        result = fmin_l_bfgs_b(self.cost, theta, self.grad, args=(X, y))[0]
        return SoftmaxModel(result.reshape((-1, X.shape[1])))
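approx_grad estimates the gradient with central differences, which is the standard way to validate the analytic grad. A quick check, sketched under the assumption that the SoftmaxLearner above is in scope, on small random data:

    import numpy as np

    learner = SoftmaxLearner()
    rng = np.random.RandomState(0)
    X = np.column_stack((np.ones(20), rng.randn(20, 3)))  # intercept + 3 features
    y = rng.randint(3, size=20)                           # 3 classes
    theta = 0.1 * rng.randn(3 * X.shape[1])
    diff = np.abs(learner.grad(theta, X, y) - learner.approx_grad(theta, X, y))
    print(diff.max())  # should be tiny, on the order of 1e-6 or smaller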
Example 11
    def test_remove_nan_classes(self):
        table = Table("imports-85")
        self.assertTrue(np.isnan(table.Y).any())
        table = RemoveNaNClasses(table)
        self.assertTrue(not np.isnan(table.Y).any())
Example 12
class LinearRegressionLearner(Learner):
    '''L2 regularized linear regression (a.k.a. Ridge regression)

    This model uses the L-BFGS algorithm to minimize the linear least
    squares penalty with L2 regularization. When using this model you
    should:

    - Choose a suitable regularization parameter lambda_
    - Consider appending a column of ones to the dataset (intercept term)

    Parameters
    ----------

    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional (default=[RemoveNaNClasses(), Normalize(), Continuize(), Impute(), RemoveNaNColumns()])
        Preprocessors are applied to data before training or testing. Default
        preprocessors:

        - remove rows with missing class values,
        - transform the dataset so that the columns are on a similar scale,
        - continuize all discrete attributes,
        - remove columns with all values NaN,
        - replace remaining NaN values with suitable values.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """

    Examples
    --------

        import numpy as np
        from Orange.data import Table
        from Orange.regression.linear_bfgs import LinearRegressionLearner

        data = Table('housing')
        data.X = np.hstack((data.X, np.ones((data.X.shape[0], 1)))) # append ones
        m = LinearRegressionLearner(lambda_=1.0)
        c = m(data) # fit
        print(c(data)) # predict
    '''
    name = 'linear_bfgs'
    preprocessors = [
        RemoveNaNClasses(),
        Normalize(),
        Continuize(),
        Impute(),
        RemoveNaNColumns()
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):

        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, theta, X, y):
        t = X.dot(theta) - y

        cost = t.dot(t)
        cost += self.lambda_ * theta.dot(theta)
        cost /= 2.0 * X.shape[0]

        grad = X.T.dot(t)
        grad += self.lambda_ * theta
        grad /= X.shape[0]

        return cost, grad

    def fit(self, X, Y, W):
        if len(Y.shape) > 1 and Y.shape[1] > 1:
            raise ValueError('Linear regression does not support '
                             'multi-target classification')

        if np.isnan(np.sum(X)) or np.isnan(np.sum(Y)):
            raise ValueError('Linear regression does not support '
                             'unknown values')

        theta = np.zeros(X.shape[1])
        theta, cost, ret = fmin_l_bfgs_b(self.cost_grad,
                                         theta,
                                         args=(X, Y.ravel()),
                                         **self.fmin_args)

        return LinearRegressionModel(theta)
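cost_grad returns the cost and its gradient as a pair, which is exactly what fmin_l_bfgs_b expects. The gradient can be sanity-checked with scipy.optimize.check_grad by splitting the pair into two wrappers; a sketch on random data, assuming the learner above:

    import numpy as np
    from scipy.optimize import check_grad

    learner = LinearRegressionLearner(lambda_=1.0)
    rng = np.random.RandomState(0)
    X, y = rng.randn(50, 5), rng.randn(50)
    err = check_grad(lambda t: learner.cost_grad(t, X, y)[0],
                     lambda t: learner.cost_grad(t, X, y)[1],
                     np.zeros(5))
    print(err)  # should be close to zero (roughly 1e-7)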
Example 13
class SklLearner(_ReprableWithParams, Learner, metaclass=WrapperMeta):
    """
    ${skldoc}
    Additional Orange parameters

    preprocessors : list, optional (default=[RemoveNaNClasses(), Continuize(), RemoveNaNColumns(), SklImpute()])
        An ordered list of preprocessors applied to data before
        training or testing.
    """
    __wraps__ = None
    __returns__ = SklModel
    _params = {}

    preprocessors = default_preprocessors = [
        RemoveNaNClasses(),
        Continuize(),
        RemoveNaNColumns(),
        SklImpute()]

    @property
    def params(self):
        return self._params

    @params.setter
    def params(self, value):
        self._params = self._get_sklparams(value)

    def _get_sklparams(self, values):
        skllearner = self.__wraps__
        if skllearner is not None:
            spec = inspect.getargs(skllearner.__init__.__code__)
            # first argument is 'self'
            assert spec.args[0] == "self"
            params = {name: values[name] for name in spec.args[1:]
                      if name in values}
        else:
            raise TypeError("Wrapper does not define '__wraps__'")
        return params

    def preprocess(self, data):
        data = super().preprocess(data)

        if any(v.is_discrete and len(v.values) > 2
               for v in data.domain.attributes):
            raise ValueError("Wrapped scikit-learn methods do not support " +
                             "multinomial variables.")

        return data

    def __call__(self, data):
        m = super().__call__(data)
        m.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        m.params = self.params
        return m

    def fit(self, X, Y, W=None):
        clf = self.__wraps__(**self.params)
        Y = Y.reshape(-1)
        if W is None or not self.supports_weights:
            return self.__returns__(clf.fit(X, Y))
        return self.__returns__(clf.fit(X, Y, sample_weight=W.reshape(-1)))

    @property
    def supports_weights(self):
        """Indicates whether this learner supports weighted instances.
        """
        return 'sample_weight' in self.__wraps__.fit.__code__.co_varnames
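supports_weights infers weight support by looking for a sample_weight argument in the wrapped estimator's fit. The same check can be written with inspect.signature, which also sees through decorated methods; a sketch assuming scikit-learn is installed:

    import inspect
    from sklearn.tree import DecisionTreeClassifier

    def supports_weights(estimator):
        # True if the estimator's fit() accepts a sample_weight argument
        return "sample_weight" in inspect.signature(estimator.fit).parameters

    print(supports_weights(DecisionTreeClassifier()))  # True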
Example 14
class SoftmaxRegressionLearner(Learner):
    """L2 regularized softmax regression classifier.
    Uses the L-BFGS algorithm to minimize the categorical
    cross entropy cost with L2 regularization. This model is suitable
    when dealing with a multi-class classification problem.

    When using this learner you should:

    - choose a suitable regularization parameter lambda\_,
    - consider using many logistic regression models (one for each
      value of the class variable) instead of softmax regression.

    Parameters
    ----------

    lambda\_ : float, optional (default=1.0)
        Regularization parameter. It controls trade-off between fitting the
        data and keeping parameters small. Higher values of lambda\_ force
        parameters to be smaller.

    preprocessors : list, optional (default=[RemoveNaNClasses(), RemoveNaNColumns(), Impute(), Continuize(), Normalize()])
        Preprocessors are applied to data before training or testing. Default
        preprocessors:

        - remove rows with missing class values,
        - remove columns with all values NaN,
        - replace remaining NaN values with suitable values,
        - continuize all discrete attributes,
        - transform the dataset so that the columns are on a similar scale.

    fmin_args : dict, optional
        Parameters for L-BFGS algorithm.
    """
    name = 'softmax'
    preprocessors = [
        RemoveNaNClasses(),
        RemoveNaNColumns(),
        Impute(),
        Continuize(),
        Normalize()
    ]

    def __init__(self, lambda_=1.0, preprocessors=None, **fmin_args):
        super().__init__(preprocessors=preprocessors)
        self.lambda_ = lambda_
        self.fmin_args = fmin_args

    def cost_grad(self, Theta_flat, X, Y):
        Theta = Theta_flat.reshape((self.num_classes, X.shape[1]))

        M = X.dot(Theta.T)
        P = np.exp(M - np.max(M, axis=1)[:, None])
        P /= np.sum(P, axis=1)[:, None]

        cost = -np.sum(np.log(P) * Y)
        cost += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
        cost /= X.shape[0]

        grad = X.T.dot(P - Y).T
        grad += self.lambda_ * Theta
        grad /= X.shape[0]

        return cost, grad.ravel()

    def fit(self, X, y, W):
        if len(y.shape) > 1:
            raise ValueError('Softmax regression does not support '
                             'multi-label classification')

        if np.isnan(np.sum(X)) or np.isnan(np.sum(y)):
            raise ValueError('Softmax regression does not support '
                             'unknown values')

        X = np.hstack((X, np.ones((X.shape[0], 1))))

        self.num_classes = np.unique(y).size
        Y = np.eye(self.num_classes)[y.ravel().astype(int)]

        theta = np.zeros(self.num_classes * X.shape[1])
        theta, j, ret = fmin_l_bfgs_b(self.cost_grad,
                                      theta,
                                      args=(X, Y),
                                      **self.fmin_args)
        Theta = theta.reshape((self.num_classes, X.shape[1]))

        return SoftmaxRegressionModel(Theta)
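Like any Orange Learner, SoftmaxRegressionLearner is invoked on a data table to produce a model, with the default preprocessors above (including RemoveNaNClasses) applied first. A minimal usage sketch, assuming the iris dataset bundled with Orange:

    from Orange.data import Table

    data = Table('iris')
    learner = SoftmaxRegressionLearner(lambda_=1.0)
    model = learner(data)
    print(model(data[:5]))  # predicted class indices for the first five rows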