Example #1
    def _launch_svc(self, kernel_train, x_test, y_train, y_test, c):

        if self._algorithm_params['balanced']:
            svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6,
                                         class_weight='balanced'))
        else:
            svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6))

        svc.fit(kernel_train, y_train)
        y_hat_train = svc.predict(kernel_train)
        # x_test must be the test-versus-train kernel matrix when kernel='precomputed'
        y_hat = svc.predict(x_test)
        # Note: OneVsOneClassifier does not expose predict_proba, so the per-class
        # probabilities of the wrapped SVC are not available through this wrapper.

        return svc, y_hat, y_hat_train
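
A minimal, self-contained sketch of how a helper like this can be driven (the synthetic data and the RBF kernel choice are illustrative, not taken from the original project): with kernel='precomputed', the matrix passed in place of x_test has to be the test-versus-train Gram matrix.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gamma = 1.0 / X.shape[1]
kernel_train = rbf_kernel(X_train, X_train, gamma=gamma)  # train/train Gram matrix
kernel_test = rbf_kernel(X_test, X_train, gamma=gamma)    # test/train Gram matrix

svc = OneVsOneClassifier(SVC(C=1.0, kernel='precomputed', tol=1e-6))
svc.fit(kernel_train, y_train)
print("test accuracy:", np.mean(svc.predict(kernel_test) == y_test))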
Example #2
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data
    """
    def __init__(self,
                 kernel=None,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 max_iter_predict=100,
                 warm_start=False,
                 copy_X_train=True,
                 random_state=None,
                 multi_class="one_vs_rest",
                 n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present." %
                             self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean([
                estimator.log_marginal_likelihood()
                for estimator in self.base_estimator_.estimators_
            ])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel([
                estimator.kernel_
                for estimator in self.base_estimator_.estimators_
            ])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelhood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean([
                    estimator.log_marginal_likelihood(theta)
                    for i, estimator in enumerate(estimators)
                ])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean([
                    estimator.log_marginal_likelihood(theta[n_dims * i:n_dims *
                                                            (i + 1)])
                    for i, estimator in enumerate(estimators)
                ])
            else:
                raise ValueError(
                    "Shape of theta must be either %d or %d. "
                    "Obtained theta with shape %d." %
                    (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))
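
The class above mirrors scikit-learn's GaussianProcessClassifier. A short usage sketch (synthetic data and the kernel choice are illustrative) contrasting the two multi-class modes; only "one_vs_rest" supports predict_proba, as documented above.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X, y = make_classification(n_samples=120, n_classes=3, n_informative=5, random_state=0)

gpc_ovr = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), multi_class="one_vs_rest",
                                    random_state=0).fit(X, y)
proba = gpc_ovr.predict_proba(X)  # shape (n_samples, 3)

gpc_ovo = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), multi_class="one_vs_one",
                                    random_state=0).fit(X, y)
y_hat = gpc_ovo.predict(X)        # predict_proba would raise ValueError in this mode
print(proba.shape, np.mean(y_hat == y))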
Example #3
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class : string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """
    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for i, estimator in enumerate(estimators)])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))
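
A complementary sketch (using the iris data purely as a stand-in) of the multi-class attributes documented above: kernel_ becomes a CompoundKernel, and log_marginal_likelihood accepts either a single sub-kernel theta or the compound-kernel theta.

from sklearn.datasets import load_iris
from sklearn.gaussian_process import GaussianProcessClassifier

X, y = load_iris(return_X_y=True)  # 3 classes
gpc = GaussianProcessClassifier(random_state=0).fit(X, y)

print(gpc.kernel_)                         # CompoundKernel over the per-class kernels
print(gpc.log_marginal_likelihood_value_)  # mean of the one-vs-rest log-marginal likelihoods

theta_single = gpc.base_estimator_.estimators_[0].kernel_.theta  # one sub-kernel's theta
print(gpc.log_marginal_likelihood(theta_single))                 # same theta reused for all sub-kernels
print(gpc.log_marginal_likelihood(gpc.kernel_.theta))            # compound-kernel theta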
Example #4
                                                               size=0.2,
                                                               seed=123)

# Training the model
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier  # Lets us treat each col of y independently
clf = OneVsOneClassifier(
    LogisticRegression())  # Fits a sep classifier for each of the cols
clf.fit(X_train, y_train)

# C - Making predictions
# Predicting on holdout data
holdout = pd.read_csv('HoldoutData.csv', index_col=0)
holdout = holdout[NUMERIC_COLUMNS].fillna(
    -1000)  # Select just numeric columns and replace NaNs
predictions = clf.predict_proba(
    holdout)  # Predicts probabilities for each label
# NOTE: sklearn's OneVsOneClassifier does not implement predict_proba;
# wrap the estimator in OneVsRestClassifier if probability outputs are needed
# If .predict() was used - output would be 0 or 1
# Log loss penalises for being confident and wrong
# As a result, there would be a worse performance compared to .predict_proba()

# Submitting your predictions as a csv
# Submission - df with column headers and row with probabilities for each column
# All formatting can be done with the pandas to_csv function
# Cols have orig column name separated from value by two '_' (some already contained '_')

# Prediction - array of values, needs to be converted to a df

prediction_df = pd.DataFrame(
    columns=pd.get_dummies(
        df[LABELS],
        prefix_sep='_').columns,  # Separates orig col names from col values
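
The snippet above is truncated at both ends. Below is a minimal, self-contained sketch of the same submission workflow using hypothetical column names (feat_a, feat_b, label); since OneVsOneClassifier lacks predict_proba, the probability step here uses OneVsRestClassifier instead.

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Hypothetical stand-ins for the course data: numeric features plus one label column
df = pd.DataFrame({"feat_a": range(100), "feat_b": range(100, 200),
                   "label": ["x", "y"] * 50})
NUMERIC_COLUMNS = ["feat_a", "feat_b"]
X = df[NUMERIC_COLUMNS].fillna(-1000)
y = df["label"]
X_train, X_holdout, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=123)

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train, y_train)
probas = clf.predict_proba(X_holdout)  # one column of probabilities per class

# Dummy-style column names ('label_x', 'label_y') match the sorted class order of probas
prediction_df = pd.DataFrame(probas,
                             columns=pd.get_dummies(df[["label"]], prefix_sep='_').columns,
                             index=X_holdout.index)
prediction_df.to_csv("predictions.csv")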
Example #5
class log_kernel_MOM(BaseEstimator):
    ''' Logistic Regression Kernel MOM
    
    Kernel logistic regression MOM risk minimization using IRLS with L2 regularization
    
    Parameters
    ----------

    K : int, default 10
        number of blocks used for the MOM computation. A large value of K tolerates more outliers, but smaller values of K give better performance when there are no outliers.
        
    eta0 : float, default 1
        step size parameter; the step size at the i-th iteration is 1/(1+eta0*i).

    beta : float, default 1
        L2 regularization parameter.

    epoch : int, default 200
        number of iterations before the end of the algorithm.

    kernel : {'rbf','poly', callable function}, default 'rbf'
        kernel used in the algorithm. A callable can also be given; it should take two matrices X1, X2 as input and return the pairwise kernel matrix.

    gamma : float, default 1/n_features
        coefficient used if the kernel is 'rbf' in which case the kernel function is exp(-gamma*x^2)

    degree : int, default 3
        degree of the polynomial if the kernel is 'poly'

    agg : int, default 1
        number of runs of the algorithm on which we aggregate. One might want to decrease this number if the complexity is a problem.

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run (agg > 1 means several progress bar).

    compter : boolean, default False
        used for outlier detection, if compter=True, the number of time each point is used in the algorithm will be recorded in the attribute "counts".

    multi : {'ovr','ovo'} , default 'ovr'
        method used to go from binary classification to multiclass classification. 'ovr' means "one vs the rest" and 'ovo' means "one vs one" .
        
    Attributes
    ----------
    
    alpha : array-like, length = n_samples
        alpha is updated in the algorithm and provides the final coefficients of the decision function.

    counts : array-like, length = n_samples
        the i-th element records the number of times the i-th element of the training dataset X has been used. Only if compter=True.

    Methods
    -------

    fit(X,y) : fit the model
        X : numpy matrix size = (n_samples,n_features)
        y : array like, length = n_samples


    predict(X) : predict the class of the points in X
        X : numpy matrix size = (n_samples,n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to each class.
        X : numpy matrix size = (n_samples,n_features)
        returns matrix, size = (n_samples,n_class)
        
    '''
    def __init__(self,
                 K=10,
                 eta0=1,
                 beta=1,
                 epoch=200,
                 kernel='rbf',
                 gamma=None,
                 degree=3,
                 agg=1,
                 verbose=True,
                 progress=False,
                 compter=False,
                 multi='ovr',
                 augmenter=1,
                 power=2 / 3):

        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        binary_clf = log_kernel_MOM_binary(K, eta0, beta, epoch, gamma, degree,
                                           agg, verbose, progress, compter,
                                           power)
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        self.X = X
        perm = np.array([])
        if (self.kernel == 'poly'):
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif (self.kernel == 'rbf'):
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        Kernel = kfunc(np.array(X), np.array(X))

        for f in range(self.augmenter):
            perm = np.hstack([perm, np.random.permutation(len(X))])
        self.perm = perm.astype(np.int64)

        self.clf.fit(Kernel[self.perm][:, self.perm], y[self.perm])
        return self

    def predict(self, xtest):

        if (self.kernel == 'poly'):
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif (self.kernel == 'rbf'):
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict(KC)

    def predict_proba(self, xtest):

        if (self.kernel == 'poly'):
            kfunc = lambda x, y: polynomial_kernel(
                x, y, degree=self.degree, gamma=self.gamma)
        elif (self.kernel == 'rbf'):
            kfunc = lambda x, y: rbf_kernel(x, y, self.gamma)
        else:
            kfunc = self.kernel
        KC = kfunc(xtest, self.X[self.perm])
        return self.clf.predict_proba(KC)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        self.__init__(**params)
        return self
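
The fit/predict methods above follow a generic pattern: precompute the kernel matrix, then hand it to a OneVsRest/OneVsOne wrapper around a binary learner. A self-contained sketch of that pattern, with LogisticRegression standing in for the MOM binary classifier (which is defined elsewhere and not shown here):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.multiclass import OneVsOneClassifier

X, y = make_classification(n_samples=150, n_classes=3, n_informative=5, random_state=0)
gamma = 1.0 / X.shape[1]

K_train = rbf_kernel(X, X, gamma=gamma)       # train/train kernel, as computed in fit() above
clf = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(K_train, y)

K_test = rbf_kernel(X, X, gamma=gamma)        # test-vs-train kernel, as in predict() above
print(np.mean(clf.predict(K_test) == y))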
Example #6
class ClassifierRunner():
    def __init__(self,
                 pipeline,
                 clf,
                 clf_name,
                 example_indices,
                 selector_params=None,
                 multiclass=None,
                 compare_classifiers=None):
        self.pipeline = pipeline
        self.clf = clf
        self.clf_name = clf_name
        self.example_indices = example_indices
        self.selector_params = selector_params
        self.selector_param_str = self.selector_params['combine'] + str(
            self.selector_params['threshold'])
        self.multiclass = multiclass
        self.compare_classifiers = compare_classifiers

    def fit(self, X_train_transformed, y_train, num_classes):
        self.num_classes = num_classes
        self.num_features = len(X_train_transformed[0])

        if hasattr(self.clf, 'reset'):  # reset classifier from sklearn library
            self.clf = self.clf.reset()
        else:
            self.clf = base.clone(self.clf)
            if self.num_classes > 2:
                if self.multiclass == 'ovr':
                    self.clf = OneVsRestClassifier(self.clf)
                else:
                    self.clf = OneVsOneClassifier(self.clf)
        self.clf.fit(X_train_transformed, y_train)

    def run_prediction(self, X_test_transformed, y_test):
        # get predictions
        y_pred = self.clf.predict(X_test_transformed)

        self.proba = None
        if (self.num_classes == 2 or self.multiclass
                == 'ovr') and self.clf.__class__.__name__ != 'SVC':
            self.proba = self.clf.predict_proba(X_test_transformed)

        # get ids of examples that were misclassified
        if self.compare_classifiers == 'mcnemar':
            for i in range(len(y_pred)):
                if y_pred[i] != y_test[i]:
                    self.pipeline.misclassified_map[self.clf_name].append(
                        self.example_indices[i])

        self.write_predictions(y_pred)

        self.write_metrics(y_test, y_pred)

    def write_predictions(self, y_pred):
        if self.proba is not None and y_pred is not None:
            for i in range(len(y_pred)):
                self.pipeline.prediction_scores[self.selector_param_str][
                    self.clf_name][self.example_indices[i]] = self.proba[i]
                self.pipeline.predictions[self.selector_param_str][
                    self.clf_name][self.example_indices[i]] = y_pred[i]

    def write_metrics(self, y_test, y_pred):
        # 'binary' averaging is only valid for two-class problems
        average = 'binary' if self.num_classes == 2 else 'micro'
        auc_score = None
        if self.proba is not None:
            if average == 'binary':
                auc_score = roc_auc_score(y_test, y_pred)
            else:
                # one-vs-rest AUC per class, computed from the predicted probabilities
                y_bin = label_binarize(y_test, classes=range(self.num_classes))
                auc_score = [roc_auc_score(y_bin[:, i], self.proba[:, i])
                             for i in range(self.num_classes)]
                auc_score = str(auc_score)

        self.pipeline.results = self.pipeline.results.append(
            {
                'base clf': self.clf_name,
                'num features': self.num_features,
                'accuracy': accuracy_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred, average=average),
                'recall': recall_score(y_test, y_pred, average=average),
                'auc': auc_score if auc_score is not None else -1,
                'f1': f1_score(y_test, y_pred, average=average)
            },
            ignore_index=True)

        if self.selector_params is not None:
            for k in self.selector_params.keys():
                self.pipeline.results.loc[len(self.pipeline.results) - 1,
                                          k] = self.selector_params[k]
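
A self-contained sketch of the per-class AUC computation used in write_metrics above: binarize the labels one-vs-rest with label_binarize, then score each probability column with roc_auc_score (the data and model here are illustrative).

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

X, y = make_classification(n_samples=300, n_classes=3, n_informative=5, random_state=0)
proba = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)

y_bin = label_binarize(y, classes=range(3))  # shape (n_samples, 3), one column per class
auc_per_class = [roc_auc_score(y_bin[:, i], proba[:, i]) for i in range(3)]
print(auc_per_class)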
Example #7
class perceptronMOM(BaseEstimator):

    '''Perceptron MOM classifier.
    Perceptron MOM risk minimization. The Perceptron minimizes the perceptron loss using SGD without regularization.
    
    Parameters
    ----------

    w0 : array-like, length = n_features + 1, default ones(n_features + 1)
        initial coefficients (including the intercept) of the classifier.

    K : int, default 10
        number of blocks used for the MOM computation. A large value of K tolerates more outliers, but smaller values of K give better performance when there are no outliers.
        
    eta0 : float, default 1
        step size parameter; the step size at the i-th iteration is 1/(1+eta0*i).

    epoch : int, default 200
        number of iterations before the end of the algorithm.

    mu : float between 0 and 1, default 0.95
        coefficient in the momentum.

    agg : int, default 1
        number of runs of the algorithm on which we aggregate. One might want to decrease this number if the complexity is a problem.

    compter : boolean, default False
        used for outlier detection, if compter=True, the number of time each point is used in the algorithm will be recorded in the attribute "counts".

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run (agg > 1 means several progress bar).

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    multi : {'ovr','ovo'} , default 'ovr'
        method used to go from binary classification to multiclass classification. 'ovr' means "one vs the rest" and 'ovo' means "one vs one" .
        
    Attributes
    ----------
    
    w0 : array-like, length = n_features + 1
        w0 is updated in the algorithm and provides the final coefficients of the decision function.

    counts : array-like, length = n_samples
        the i-th element records the number of times the i-th element of the training dataset X has been used. Only if compter=True.

    Methods
    -------

    fit(X,y) : fit the model
        X : numpy matrix size = (n_samples,n_features)
        y : array like, length = n_samples


    predict(X) : predict the class of the points in X
        X : numpy matrix size = (n_samples,n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belongs to each class.
        X : numpy matrix size = (n_samples,n_features)
        returns matrix, size = (n_samples,n_class)
        
    '''

    def __init__(self, w0=None, K=10, eta0=1, epoch=100, mu=0.95, agg=1,
                 compter=False, progress=False, verbose=True, multi='ovr'):
        binary_clf = perceptronMOM_binary(w0, K, eta0, epoch, mu, agg,
                                          compter, progress, verbose)
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")
        for arg, val in values.items():
            setattr(self, arg, val)
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        self.__init__(**params)
        return self
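
For context on the multi='ovr'/'ovo' choice used here (and in the kernel variant above), a small, self-contained comparison of the two scikit-learn meta-estimators: OvR fits one estimator per class and exposes predict_proba, OvO fits one per class pair and does not.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

X, y = make_classification(n_samples=200, n_classes=4, n_informative=6, random_state=0)

ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

print(len(ovr.estimators_))   # 4  (one per class)
print(len(ovo.estimators_))   # 6  (one per pair: 4 * 3 / 2)
print(hasattr(ovr, "predict_proba"), hasattr(ovo, "predict_proba"))  # True False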
Example #8
class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    -----
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    -----
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

          >>> import sklearn.multiclass, functools
          >>> multi_class=functools.partial(
          ...     sklearn.multiclass.OutputCodeClassifier,
          ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass to `_BaseSeCoEstimator` classes `y`
          from an integer range [0..n_classes_), e.g. using `LabelEncoder`.
          Also be aware of class order influence on tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for class
          binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    -----
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    -----
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization & target transformation for display

    def _more_tags(self):
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.

        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)

        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures:  first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present." %
                             self.classes_[0])
        elif n_classes_ == 2:
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"

            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(self.base_estimator_,
                                                          n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # TODO: if self.multi_class=='direct' (not `None` auto-detect), only assertion prevents binary-only learner to silently learn on multiclass training data
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s" %
                                 self.multi_class_)

        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
            Depending on the multi-class strategy, the class labels they use
            differ in order and value.
            Cannot be used when self.multi_class_ is a callable.
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            assert False, "invalid state: unknown type of base_estimator_ " \
                f"({str(self.base_estimator_)})"
Example #9
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for example
    Huber estimator or median-of-means estimator) [1], [3]. The idea behind this
    algorithm was mentioned before in [2].
    This idea translates into an iterative algorithm where the sample weights
    are changed at each iteration and depend on the sample. Informally,
    outliers should receive a small weight while inliers should receive a large
    weight, where outliers are samples with a large loss.

    This algorithm enjoys a non-zero breakdown point (it can handle arbitrarily
    bad outliers). When the "mom" weighting scheme is used, k outliers can be
    tolerated. When the "Huber" weighting scheme is used, asymptotically the
    number of outliers has to be less than half the sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or  'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for the Huber weighting procedure, used only if weighting
        is 'huber'. Measures the robustness of the weighting procedure. A small
        value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=1
        Parameter used for the mom weighting procedure, used only if weighting
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation; a higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported: 'squared_loss'.

    sgd_args : dict, default={}
        arguments of the SGDClassifier base estimator.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or "ovr"
        for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.



    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function. Only available if
        multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : ndarray of shape (n_classes,) or (1, )
        Actual number of iterations for all classes. If binary or multinomial,
        it returns only 1 element. For liblinear solver, only the maximum
        number of iteration across all classes is given.

    base_estimator_ : object,
        The fitted base estimator SGDCLassifier.

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is. Only available if
        multi_class = "binary"


    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.
        """

        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        self.n_iter_ = self.max_iter * len(X)
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """

        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        return self.base_estimator_._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
           Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
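
A short sketch (assuming the scikit-learn-extra package that ships this class is installed) of the weights_ attribute documented above: with multi_class="binary", the per-sample weights can be inspected to flag likely outliers.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn_extra.robust import RobustWeightedClassifier

X, y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]), random_state=42)
X[:5] += 10  # corrupt a few samples

clf = RobustWeightedClassifier(weighting="mom", k=5, multi_class="binary").fit(X, y)
low_weight_idx = np.argsort(clf.weights_)[:5]  # samples the algorithm down-weighted the most
print(low_weight_idx)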