def fit_transform(self,X,y=None):
        """
        Fit an sklearn classifier to data

        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples
        y : array-like, required for array-like X; not presently used for pandas dataframe
           class labels

        Returns
        -------
        self: object

        """
        if isinstance(X,pd.DataFrame):
            df = X
            (X,y,self.vectorizer) = self.convert_numpy(df)
        else:
            check_X_y(X,y)

        self.clf.fit(X,y)
        return self
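
Since every snippet on this page revolves around sklearn's check_X_y validator, here is a minimal standalone illustration of what it returns (the toy data is purely illustrative):

from sklearn.utils import check_X_y

X = [[0, 1], [2, 3], [4, 5]]
y = [0, 1, 0]
X_checked, y_checked = check_X_y(X, y)   # validates lengths/shapes and converts to ndarrays
print(X_checked.shape, y_checked.shape)  # -> (3, 2) (3,)
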
Example #2
def check_consistent_shape(X_train, y_train, X_test, y_test, y_train_pred,
                           y_test_pred):
    """Internal shape to check input data shapes are consistent.

    Parameters
    ----------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the test samples.

    Returns
    -------
    X_train : numpy array of shape (n_samples, n_features)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : numpy array of shape (n_samples,)
        The predicted binary labels of the test samples.
    """

    # check input data shapes are consistent
    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)

    y_test_pred = column_or_1d(y_test_pred)
    y_train_pred = column_or_1d(y_train_pred)

    check_consistent_length(y_train, y_train_pred)
    check_consistent_length(y_test, y_test_pred)

    if X_train.shape[1] != X_test.shape[1]:
        raise ValueError("X_train {0} and X_test {1} have different number "
                         "of features.".format(X_train.shape, X_test.shape))

    return X_train, y_train, X_test, y_test, y_train_pred, y_test_pred
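
A quick usage sketch for the helper above, assuming numpy and the sklearn validators it relies on are already imported; the random data is only illustrative:

import numpy as np

X_train, X_test = np.random.rand(8, 3), np.random.rand(4, 3)
y_train, y_test = np.zeros(8), np.ones(4)
y_train_pred, y_test_pred = np.zeros(8), np.ones(4)

# Returns the validated arrays; raises ValueError if the feature counts differ.
checked = check_consistent_shape(X_train, y_train, X_test, y_test,
                                 y_train_pred, y_test_pred)
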
Example #3
    def fit(self,X,y=None):
        """Fit a model: 

        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples. If a pandas dataframe, can handle a dict of features in one column or convert a set of columns
        y : array-like, required for array-like X; not presently used for pandas dataframe
           class labels

        Returns
        -------
        self: object


        """
        if isinstance(X,pd.DataFrame):
            df = X
            if self.dict_feature is not None:
                if self.target_readable is not None:
                    self.create_class_id_map(df,self.target,self.target_readable)
                (X,y) = self._load_from_dict(df)
                num_class = len(np.unique(y))
            else:
                (X,y,self.vectorizer) = self.convert_numpy(df)
                num_class = len(y.unique())
        else:
            check_X_y(X,y)
            num_class = len(np.unique(y))

        self.clf = xgb.XGBClassifier(**self.params)
        print(self.clf.get_params(deep=True))
        self.clf.fit(X,y,verbose=True)
        return self
def test_check_array_warn_on_dtype_deprecation():
    X = np.asarray([[0.0], [1.0]])
    Y = np.asarray([[2.0], [3.0]])
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_array(X, warn_on_dtype=True)
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_X_y(X, Y, warn_on_dtype=True)
Example #5
    def fit(self,X,y=None):
        """Derived from https://github.com/fchollet/keras/blob/master/keras/wrappers/scikit_learn.py
        Adds:
        Handling pandas inputs
        Saving of model into the class to allow for easy pickling

        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples
        y : array-like, required for array-like X; not presently used for pandas dataframe
           class labels

        Returns
        -------
        self: object

        """
        if isinstance(X,pd.DataFrame):
            df = X
            (X,y,self.vectorizer) = self.convert_numpy(df)
        else:
            check_X_y(X,y)

        input_width = X.shape[1]
        num_classes = len(y.unique())
        logger.info("input_width %d",input_width)
        logger.info("num_classes %d",num_classes)
        train_y = np_utils.to_categorical(y, num_classes)
        self.model = self.model_create(input_width,num_classes)

        if len(y.shape) == 1:
            self.classes_ = list(np.unique(y))
            if self.loss == 'categorical_crossentropy':
                y = to_categorical(y)
        else:
            self.classes_ = np.arange(0, y.shape[1])
        
        if self.compiled_model_ is None:
            self.compiled_model_ = copy.deepcopy(self.model)
            self.compiled_model_.compile(optimizer=self.optimizer, loss=self.loss)
        history = self.compiled_model_.fit(
            X, y, batch_size=self.train_batch_size, nb_epoch=self.nb_epoch, verbose=self.verbose,
            shuffle=self.shuffle, show_accuracy=self.show_accuracy,
            validation_split=self.validation_split, validation_data=self.validation_data,
            callbacks=self.callbacks)

        self.config_ = self.model.to_json()
        self.compiled_model_.save_weights(self.tmp_model)
        with open(self.tmp_model, mode='rb') as file: # b is important -> binary
            self.model_saved = file.read()
        return self
Example #6
    def fit(self,X,y=None):
        """Convert data to vw lines and then train for required iterations
           
        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples
        y : array-like, required for array-like X; not presently used for pandas dataframe
           class labels

        Returns
        -------
        self: object

        Caveats:
        1. A Seldon-specific fork of wabbit_wappa is needed to allow vw to run in server mode without save_resume. save_resume seems to cause issues with the scores returned; maybe connected to https://github.com/JohnLangford/vowpal_wabbit/issues/262
        """
        if isinstance(X,pd.DataFrame):
            df = X
            df_base = self._exclude_include_features(df)
            df_base = df_base.fillna(0)
        else:
            check_X_y(X,y)
            df = pd.DataFrame(X)
            df_y = pd.DataFrame(y,columns=list('y'))
            self.target='y'
            df_base = pd.concat([df,df_y],axis=1)
            print(df_base.head())

        min_target = df_base[self.target].astype(float).min()
        print "min target ",min_target
        if min_target == 0:
            self.zero_based = True
        else:
            self.zero_based = False
        if self.target_readable is not None:
            self.create_class_id_map(df,self.target,self.target_readable,zero_based=self.zero_based)

        self.num_classes = len(df_base[self.target].unique())
        print "num classes ",self.num_classes
        self._start_vw_if_needed("train")
        df_vw = df_base.apply(self._convert_row,axis=1)
        for i in range(0,self.num_iterations):
            for (index, val) in df_vw.items():
                self.vw.send_line(val,parse_result=False)
        self._save_model(self.model_file)        
        return self
Example #7
    def fit(self, X, y):
        """Fit joint quantile regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.
        y : {array-like}, shape = [n_samples]
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        if self.eps > 0 and self.nc_const:
            raise UserWarning("eps is considered null because you chose to "
                              "enfoce non-crossing constraints.")
        X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True)
        y = asarray(y).flatten()
        self._validate_params()

        self.linop_ = self._get_kernel_map(X)
        gram = self.linop_.Gram_dense(X)
        self.reg_c_ = 1. / self.lbda

        # Solve the optimization problem
        # probs = asarray(self.probs).reshape((-1, 1))
        probs = asarray(self.probs).flatten()
        if self.nc_const:
            self._qp_nc(gram, y, probs)
        else:
            self._coneqp(gram, y, probs)
        return self
    def fit(self, X, y=None):
        """Fit the model using X as training data.

            Parameters
            ----------
            X : {array-like, sparse matrix}, optional
                Training data. If array or matrix, shape = [n_samples, n_features]
                If X is None, a "lazy fitting" is performed. If kneighbors is called, the fitting
                with with the data there is done. Also the caching of computed hash values is deactivated in
                this case.
            y : list, optional (default = None)
                List of classes for the given input of X. Size have to be n_samples."""
        
        if y is not None:
            self._y_is_csr = True
            _, self._y = check_X_y(X, y, "csr", multi_output=True)
            if self._y.ndim == 1 or self._y.shape[1] == 1:
                self._y_is_csr = False
        else:
            self._y_is_csr = False
        X_csr = csr_matrix(X)
       
        self._index_elements_count = X_csr.shape[0]
        instances, features = X_csr.nonzero()
        maxFeatures = int(max(X_csr.getnnz(1)))
        
        data = X_csr.data
        
        # returns a pointer to the inverse index stored in c++
        self._pointer_address_of_nearestNeighbors_object = _nearestNeighbors.fit(instances.tolist(), features.tolist(), data.tolist(), 
                                                    X_csr.shape[0], maxFeatures,
                                                    self._pointer_address_of_nearestNeighbors_object)
Example #9
 def fit(self,X,y):
     '''
     Fits variational relevance vector regression
             
     Parameters
     -----------
     X: array-like of size [n_samples, n_features]
        Training data, matrix of explanatory variables
     
     y: array-like of size [n_samples]
        Target values
        
     Returns
     -------
     self : object
         Returns self.
     '''
     X,y = check_X_y(X,y, dtype = np.float64)
     # kernelise features
     K = self._get_kernel( X, X)
     # use fit method of RegressionARD
     _ = super(VRVR,self).fit(K,y)
     self.relevant_  = np.where(self.active_== True)[0]
     if X.ndim == 1:
         self.relevant_vectors_ = X[self.relevant_]
     else:
         self.relevant_vectors_ = X[self.relevant_,:]
     return self
Example #10
 def anotherfit(self, X, y):
     X,y=check_X_y(X,y)
         
     GaussianNB.fit(self,X,y)
 
     for name in self.equivalent:
         super(GaussianNB,self).__setattr__(name,self.__getattribute__(self.equivalent[name]))
def my_smote(X, y, minority_target=None, per=0.5):
    """
    This function is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and
    SVM-SMOTE.
    :param X: nd-array, sparse matrix, shape=[n_samples, n_features]
    :param y: nd-array, list, shape=[n_samples]
    :param minority_target: list
    :param per: float (default=0.5), target proportion of minority samples after oversampling
    :return:
    """
    X, Y = check_X_y(X, y, 'csr')
    unique_label = list(set(Y))
    label_count = [np.sum(Y == i) for i in unique_label]

    if minority_target is None:
        minority_index = [np.argmin(label_count)]
    else:
        minority_index = [unique_label.index(target) for target in minority_target]

    majority = np.max(label_count)
    for i in minority_index:
        N = (int((majority * 1.0 / (1 - per) - majority) / label_count[i]) - 1) * 100
        safe, synthetic, danger = _smote._borderlineSMOTE(X, Y, unique_label[i], N, k=5)
        syn_label = np.array([unique_label[i]] * synthetic.shape[0])
        X = sp.vstack([X, synthetic])
        Y = np.concatenate([Y, syn_label])

    return X, Y
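
To make the oversampling amount above concrete, here is a worked example of the N computation (a sketch with made-up class counts):

majority, minority, per = 900, 100, 0.5
# Target minority size = majority * per / (1 - per) = 900, so per is the minority
# share of the (minority + majority) total after oversampling.
N = (int((majority * 1.0 / (1 - per) - majority) / minority) - 1) * 100
print(N)  # -> 800, i.e. 800% oversampling: 8 synthetic samples per minority sample
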
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        print("c=%s, cov_algo=%s" % (self.c, self.cov_algo))

        classes = np.unique(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.class_prior_ = np.zeros(n_classes)
        self.class_count_ = np.zeros(n_classes)
        unique_y = np.unique(y)

        for y_i in unique_y:
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]
            sw_i = None
            N_i = X_i.shape[0]

            self.class_count_[i] += N_i

        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
        self.priors = self.class_prior_

        self.posteriors = []

        for klass in self.classes_:
            examples = self._examples_for_class(klass, X, y)
            mean = np.array(examples.mean(0))[0]
            cov = self._cov(examples)
            cov_smoothed = cov + (self.c * np.eye(mean.shape[0]))
            p_x = multivariate_normal(mean=mean, cov=cov_smoothed)
            self.posteriors.append(p_x)
        return self
Example #13
    def fit(self, X, y):
        """Fit ORFF ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, ['csr', 'csc', 'coo'],
                         y_numeric=True, multi_output=True)
        self._validate_params()
        self.p = y.shape[1] if y.ndim > 1 else 1

        solver_params = self.solver_params or {}

        self.linop_ = self._get_kernel(X, y)
        self.phix_ = self.linop_.get_orff_map(X, self.D)
        risk = ORFFRidgeRisk(self.lbda, 'LS')
        self.solver_res_ = minimize(risk.functional_grad_val,
                                    zeros(self.phix_.shape[1],
                                          dtype=X.dtype),
                                    args=(y.ravel(), self.phix_, self.linop_),
                                    method=self.solver,
                                    jac=True, options=solver_params)
        self.coefs_ = self.solver_res_.x
        return self
Example #14
 def fit(self, x, y):
     """
     Constructs GAM model(s) to predict y from X
     
     x: 1 or 2 dimensional array of predictor values with each row being one observation
     y: 1 or 2 dimensional array of predicted values (a GAM model is constructed for each output if y is 2 dimensional)
     """
     # Input validation for standard estimators using sklearn utils
     x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"], multi_output=True)
     # Convert to R matrices
     if (
         x.ndim == 1
     ):  # If we're only looking at 1 x at a time, shape[1] will give an error for one-dimensional arrays. Sklearn input validation doesn't change that.
         rX = r.matrix(x, nrow=x.shape[0], ncol=1)
     else:
         rX = r.matrix(x, nrow=x.shape[0], ncol=x.shape[1])
     if (
         y.ndim == 1
     ):  # If we're only looking at 1 y at a time, shape[1] will give an error for one-dimensional arrays
         rY = r.matrix(y, nrow=y.shape[0], ncol=1)
     else:
         rY = r.matrix(y, nrow=y.shape[0], ncol=y.shape[1])
     # Compute models (one for each column in y)
     self.gammodels = self.computeGAM(rX, rY)
     return self
Example #15
    def _check_params(self, X, y):
        # checking input data and scaling it if y is continuous
        X, y = check_X_y(X, y)
        
        if not self.categorical:
            ss = StandardScaler()
            X = ss.fit_transform(X)
            y = ss.fit_transform(y.reshape(-1, 1)).ravel()

        # sanity checks
        methods = ['JMI', 'JMIM', 'MRMR']
        if self.method not in methods:
            raise ValueError('Please choose one of the following methods:\n' +
                             '\n'.join(methods))

        if not isinstance(self.k, int):
            raise ValueError("k must be an integer.")
        if self.k < 1:
            raise ValueError('k must be larger than 0.')
        if self.categorical and np.any(self.k > np.bincount(y)):
            raise ValueError('k must be smaller than your smallest class.')

        if not isinstance(self.categorical, bool):
            raise ValueError('Categorical must be Boolean.')
        if self.categorical and np.unique(y).shape[0] > 5:
            print('Are you sure y is categorical? It has more than 5 levels.')
        if not self.categorical and self._isinteger(y):
            print('Are you sure y is continuous? It seems to be discrete.')
        if self._isinteger(X):
            print('The values of X seem to be discrete. MI_FS will treat them '
                  'as continuous.')
        return X, y
Example #16
 def fit(self,X,y):
     '''
     Fits L2VM model
     
     Parameters:
     -----------
     X: numpy array of size 'n x m'
        Matrix of explanatory variables
        
     y: numpy array of size 'n x 1'
        Vector of dependent variable
     
     Return
     ------
     obj: self
       self
     '''
     X,y = check_X_y(X,y, dtype = np.float64)
     K   = get_kernel(X, X, self.gamma, self.degree, self.coef0, self.kernel, 
                      self.kernel_params )
     self._model = LogisticRegression( penalty = "l1", dual = False, C = self.C, 
                                       tol = self.tol, fit_intercept = self.fit_intercept,
                                       intercept_scaling=self.intercept_scaling,
                                       n_jobs = self.n_jobs, solver = 'liblinear',
                                       multi_class = 'ovr', max_iter = self.max_iter,
                                       verbose = self.verbose, random_state = self.random_state)
     self._model = self._model.fit(K,y)
     self.relevant_indices_ = [np.where(coefs!=0)[0] for coefs in self._model.coef_] 
     self.relevant_vectors_ = [X[rvi,:] for rvi in self.relevant_indices_]
     self.classes_  = self._model.classes_
     return self
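
The same kernelise-then-fit-a-sparse-linear-model pattern can be sketched with plain scikit-learn pieces; a minimal sketch with illustrative data (the RBF kernel and C value are assumptions, not taken from the snippet above):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LogisticRegression

X = np.random.rand(30, 2)
y = np.repeat([0, 1], 15)
K = rbf_kernel(X, X, gamma=1.0)                      # kernelise the features
clf = LogisticRegression(penalty='l1', solver='liblinear', C=1.0).fit(K, y)
relevant = np.where(clf.coef_[0] != 0)[0]            # training points with nonzero weight
relevant_vectors = X[relevant, :]
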
Example #17
 def fit(self,X,y):
     '''
     Fits ElasticNet Regression with kernelised features
     
     Parameters
     ----------
     X: array-like of size [n_samples, n_features]
        Matrix of explanatory variables
        
     y: array-like of size (n_samples,)
        Vector of dependent variable
     
     Returns
     -------
     obj: self
       self
     '''
     X,y = check_X_y(X,y, dtype = np.float64)
     K   = get_kernel(X, X, self.gamma, self.degree, self.coef0, self.kernel, 
                      self.kernel_params )
     model = ElasticNet(self.alpha, self.l1_ratio, self.fit_intercept,
                        self.normalize, self.precompute, self.max_iter,
                        self.copy_X, self.tol, self.warm_start, self.positive,
                        self.random_state, self.selection)
     self._model = model.fit(K,y)
     self.relevant_indices_ = np.where(self._model.coef_ != 0)[0]
     self.relevant_vectors_ = X[self.relevant_indices_,:]
     return self
Example #18
    def fit(self, X, y):
        X, y = check_X_y(X, y, dtype=np.int64, accept_sparse='csr')

        n_rows = X.shape[0]
        self.classes_ = np.unique(y)

        if sp.sparse.issparse(X):
            if self.debug: print('Features are sparse, choosing faster learning')

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm,self.k, X.shape[1]),
                                              format='Sparse', debug=True, sklearn=True, flushdir=self.flushdir,
                                              flushthreshold=20000, normalize=self.normalize)

            for i in range(n_rows):
                sparse = ['({},{})'.format(j+1, c) for j, c in zip(X[i].indices, X[i].data)]
                self.classifier.append(sparse,str(y[i]))

        else:

            self.classifier = TimblClassifier(self.prefix, "-a{} -k{} -N{} -vf".format(self.algorithm, self.k, X.shape[1]),
                                              debug=True, sklearn=True, flushdir=self.flushdir, flushthreshold=20000,
                                              normalize=self.normalize)

            if y.dtype != 'O':
                y = y.astype(str)

            for i in range(n_rows):
                self.classifier.append(list(X[i].toarray()[0]), y[i])

        self.classifier.train()
        return self
Example #19
    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

        check_is_fitted(self, 'ratio_')
        self._check_X_y(X, y)

        return self._sample(X, y)
Example #20
    def fit(self, X, y):
        """Fit ONORMA model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, False, y_numeric=True, multi_output=True)
        self._validate_params()
        self.T_ = X.shape[0] if self.T is None else self.T

        self.t_ = 0
        if y.ndim > 1:
            self.coefs_ = zeros(self.T_ * y.shape[1])
            for i in range(self.T_):
                idx = i % X.shape[0]
                self.partial_fit(X[idx, :], y[idx, :])
        else:
            self.coefs_ = zeros(self.T_)
            for i in range(self.T_):
                idx = i % X.shape[0]
                self.partial_fit(X[idx, :], y[idx])
        return self
Example #21
 def fit(self,X,y):
     '''
     Fit Relevance Vector Regression Model
     
     Parameters
     -----------
     X: {array-like,sparse matrix} of size [n_samples, n_features]
        Training data, matrix of explanatory variables
     
     y: array-like of size [n_samples]
        Target values
        
     Returns
     -------
     self: object
        self
     '''
     X,y = check_X_y(X,y, accept_sparse = ['csr','coo','bsr'], dtype = np.float64)
     # kernelise features
     K = get_kernel( X, X, self.gamma, self.degree, self.coef0, 
                    self.kernel, self.kernel_params)
     # use fit method of RegressionARD
     _ = super(RVR,self).fit(K,y)
     # convert to csr (need to use __getitem__)
     convert_tocsr = [scipy.sparse.coo.coo_matrix, scipy.sparse.dia.dia_matrix,
                      scipy.sparse.bsr.bsr_matrix]
     if type(X) in convert_tocsr:
         X = X.tocsr()
     self.relevant_  = np.where(self.active_== True)[0]
     if X.ndim == 1:
         self.relevant_vectors_ = X[self.relevant_]
     else:
         self.relevant_vectors_ = X[self.relevant_,:]
     return self
Example #22
    def fit(self,X,y):
        '''
        Fits Logistic Regression with ARD
        
        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables
        
        y: array-like of size [n_samples] 
           Target values
           
        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, accept_sparse = None, dtype=np.float64)
        n_samples, n_features = X.shape

        # preprocess features
        self._X_mean = np.zeros(n_features)
        self._X_std  = np.ones(n_features)
        if self.normalize:
            self._X_mean, self._X_std = np.mean(X,0), np.std(X,0)
        X = (X - self._X_mean) / self._X_std
        if self.fit_intercept:
            X = np.concatenate((np.ones([n_samples,1]),X),1)
            n_features += 1
        
        # preprocess targets
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])
        
        # if multiclass use OVR (i.e. fit classifier for each class)
        self.coef_,self.active_ ,self.lambda_= list(),list(),list()
        self.intercept_, self.sigma_ = list(),list()            
        for pos_class in self.classes_:
            if n_classes == 2:
                pos_class = self.classes_[1]
            mask = (y == pos_class)
            y_bin = np.zeros(y.shape, dtype=np.float64)
            y_bin[mask] = 1
            coef_, intercept_, active_ , sigma_ , A  = self._fit(X,y_bin,
                                                       n_samples,n_features)
            self.coef_.append(coef_)
            self.active_.append(active_)
            self.intercept_.append(intercept_)
            self.sigma_.append(sigma_)
            self.lambda_.append(A)
            # in case of binary classification fit only one classifier           
            if n_classes == 2:
                break
            
        return self
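
The one-vs-rest loop above boils down to binarising y once per class; a minimal sketch of that step with toy labels (not taken from the original project):

import numpy as np

y = np.array([0, 1, 2, 1, 0])
for pos_class in np.unique(y):
    y_bin = np.zeros(y.shape, dtype=np.float64)
    y_bin[y == pos_class] = 1    # 1 for the current class, 0 for the rest
    # ...fit one binary classifier on (X, y_bin) per class...
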
    def fit(self,X,y):
        '''
        Fits variational Bayesian Logistic Regression
        
        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Matrix of explanatory variables
           
        y: array-like of size [n_samples]
           Vector of dependent variables

        Returns
        -------
        self: object
           self
        '''
        # preprocess data
        X,y = check_X_y( X, y , dtype = np.float64)
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        
        # take into account bias term if required 
        n_samples, n_features = X.shape
        n_features = n_features + int(self.fit_intercept)
        if self.fit_intercept:
            X = np.hstack( (np.ones([n_samples,1]),X))
        
        # handle multiclass problems using One-vs-Rest 
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes")
        if n_classes > 2:
            self.coef_, self.sigma_ = [0]*n_classes,[0]*n_classes
            self.intercept_         = [0]*n_classes
        else:
            self.coef_, self.sigma_, self.intercept_ = [0],[0],[0]
        
        # hyperparameters of q(alpha), the approximate posterior of the weight precision
        a  = self.a + 0.5 * n_features
        b  = self.b
        
        for i in range(len(self.coef_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class   = self.classes_[i]
            mask            = (y == pos_class)
            y_bin           = np.ones(y.shape, dtype=np.float64)
            y_bin[~mask]    = 0
            coef_, sigma_   = self._fit(X,y_bin,a,b)
            intercept_ = 0
            if self.fit_intercept:
                intercept_  = coef_[0]
                coef_       = coef_[1:]
            self.coef_[i]   = coef_
            self.intercept_[i] = intercept_
            self.sigma_[i]  = sigma_
        self.coef_  = np.asarray(self.coef_)
        return self
Example #24
 def fit(self,X,y):
     '''
     Fit Relevance Vector Classifier
     
     Parameters
     -----------
     X: array-like of size [n_samples, n_features]
        Training data, matrix of explanatory variables
     
     y: array-like of size [n_samples]
        Target values
        
     Returns
     -------
     self: object
        self
     '''
     X,y = check_X_y(X,y, accept_sparse = None, dtype = np.float64)
     # kernelise features
     K = get_kernel( X, X, self.gamma, self.degree, self.coef0, 
                    self.kernel, self.kernel_params)
     # use fit method of RegressionARD
     _ = super(RVC,self).fit(K,y)
     self.relevant_  = [np.where(active==True)[0] for active in self.active_]
     if X.ndim == 1:
         self.relevant_vectors_ = [ X[relevant_] for relevant_ in self.relevant_]
     else:
         self.relevant_vectors_ = [ X[relevant_,:] for relevant_ in self.relevant_ ]
     return self
def f_classifNumba(X, y):
    """Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape = [n_samples, n_features]
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values.

    pval : array, shape = [n_features,]
        The set of p-values.

    See also
    --------
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    """
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
    args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
    return f_onewayNumba(*args)
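
A hedged usage sketch: f_classifNumba mirrors the interface of sklearn's f_classif, so calling it looks like the following (assumes f_onewayNumba and the sklearn helpers used above are in scope; the data is illustrative):

import numpy as np

X = np.random.rand(20, 4)
y = np.repeat([0, 1], 10)
F, pval = f_classifNumba(X, y)   # one F statistic and one p-value per feature
print(F.shape, pval.shape)       # -> (4,) (4,)
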
Example #26
    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        super(SMOTEENN, self).fit(X, y)

        # Fit using SMOTE
        self.sm.fit(X, y)

        return self
Example #27
    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        super(SMOTEENN, self).sample(X, y)

        # Transform using SMOTE
        X, y = self.sm.sample(X, y)

        # Fit and transform using ENN
        return self.enn.fit_sample(X, y)
Example #28
    def fit(self, X, y):
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64,
        # so cast Y to float64 for the ratio computations below.
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                                 dtype=np.float64)
        self._compute_ratios(X, Y)

        # Fit one linear SVM per class on the ratio-weighted features (NB-SVM style)
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:,i]
            svm.fit(X_i, Y_i)
            self.svm_.append(svm) 

        return self
 def _check_X_y(X, y):
     """Overwrite the checking to let pass some string for categorical
     features.
     """
     y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
     X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
     return X, y, binarize_y
    def transform(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_subset, n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_subset, n_samples_new)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_subset, n_samples, )
            If `return_indices` is `True`, a boolean array will be returned
            containing which samples have been selected.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        super(EasyEnsemble, self).transform(X, y)

        X_resampled = []
        y_resampled = []
        if self.return_indices:
            idx_under = []

        for s in range(self.n_subsets):
            if self.verbose:
                print("Creation of the set #{}".format(s))

            # Create the object for random under-sampling
            rus = RandomUnderSampler(ratio=self.ratio_,
                                     return_indices=self.return_indices,
                                     random_state=self.rs_,
                                     verbose=self.verbose,
                                     replacement=self.replacement)
            if self.return_indices:
                sel_x, sel_y, sel_idx = rus.fit_transform(X, y)
            else:
                sel_x, sel_y = rus.fit_transform(X, y)

            X_resampled.append(sel_x)
            y_resampled.append(sel_y)
            if self.return_indices:
                idx_under.append(sel_idx)

        if self.return_indices:
            return (np.array(X_resampled), np.array(y_resampled),
                    np.array(idx_under))
        else:
            return np.array(X_resampled), np.array(y_resampled)
Example #31
def check_Xs_y(Xs,
               y,
               multiview=False,
               enforce_views=None,
               return_dimensions=False):
    r"""
    Checks Xs and y for consistent length. Xs is set to be of dimension 3.

    Parameters
    ----------
    Xs : nd-array, list
        Input data.

    y : nd-array, list
        Labels.

    multiview : boolean, (default=False)
        If True, throws error if just 1 data matrix given.

    enforce_views : int, (default=not checked)
        If provided, ensures this number of views in Xs. Otherwise not
        checked.

    return_dimensions : boolean, (default=False)
        If True, the function also returns the dimensions of the multiview
        dataset. The dimensions are n_views, n_samples, n_features, where
        n_views and n_samples are respectively the number of views and the
        number of samples, and n_features is a list of length n_views
        containing the number of features of each view.

    Returns
    -------
    Xs_converted : object
        The converted and validated Xs (list of data arrays).

    y_converted : object
        The converted and validated y.

    n_views : int
        The number of views in the dataset. Returned only if
        ``return_dimensions`` is ``True``.

    n_samples : int
        The number of samples in the dataset. Returned only if
        ``return_dimensions`` is ``True``.

    n_features : list
        List of length ``n_views`` containing the number of features in
        each view. Returned only if ``return_dimensions`` is ``True``.
    """
    if return_dimensions:
        Xs_converted, n_views, n_samples, n_features = check_Xs(
            Xs,
            multiview=multiview,
            enforce_views=enforce_views,
            return_dimensions=True,
        )
    else:
        Xs_converted = check_Xs(Xs,
                                multiview=multiview,
                                enforce_views=enforce_views)
    _, y_converted = check_X_y(Xs_converted[0], y, allow_nd=False)

    if return_dimensions:
        return Xs_converted, y_converted, n_views, n_samples, n_features
    else:
        return Xs_converted, y_converted
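
A short usage sketch, assuming check_Xs (e.g. from mvlearn.utils) and numpy are importable; the two random views are illustrative only:

import numpy as np

Xs = [np.random.rand(10, 3), np.random.rand(10, 5)]   # two views, 10 samples each
y = np.random.randint(0, 2, 10)
Xs_conv, y_conv, n_views, n_samples, n_features = check_Xs_y(
    Xs, y, multiview=True, return_dimensions=True)
print(n_views, n_samples, n_features)                 # -> 2 10 [3, 5]
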
Example #32
    def fit(self, X, y):
        # Check if the dimensions are okay
        X, y = check_X_y(X, y)
        check_classification_targets(y)

        # Get the unique labels
        self.classes_, y = np.unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)

        # Check the number of classes
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        cov = None
        store_covariance = self.store_covariance or self.store_covariances

        # Store the covariance if flag is true
        if store_covariance:
            cov = []
        means = []  # Stores the class means
        scalings = [
        ]  # The variance in the rotated coordinate system (scaling)
        rotations = []  # Rotation of the gaussian to principal axes

        # For all the given classes
        for ind in range(n_classes):
            # Subset the classes
            Xg = X[y == ind, :]
            # Find the means of the classes
            meang = Xg.mean(0)
            means.append(meang)
            if len(Xg) == 1:
                raise ValueError('y has only 1 sample in class %s, covariance '
                                 'is ill defined.' % str(self.classes_[ind]))
            # Center the data
            Xgc = Xg - meang
            # Xgc = U * S * V.T
            U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
            rank = np.sum(S > self.tol)
            if rank < n_features:
                warnings.warn("Variables are collinear")
            S2 = (S**2) / (len(Xg) - 1)
            S2 = ((1 - self.reg_param) * S2) + self.reg_param
            if self.store_covariance or store_covariance:
                # cov = V * (S^2 / (n-1)) * V.T
                cov.append(np.dot(S2 * Vt.T, Vt))  # .T gives the transpose
            scalings.append(S2)
            rotations.append(Vt.T)

        # Get the pooled covariance matrix estimate
        self.class_covariance_ = _class_cov(X, y)

        # Store the covariance matrices
        if self.store_covariance or store_covariance:
            self.covariance_ = cov

        # Initialize total_covariance_
        self.total_covariance_ = []

        # Change the covariance matrices depending on alpha
        for ind in range(n_classes):
            self.total_covariance_.append(
                self.alpha * cov[ind] +
                (1 - self.alpha) * self.class_covariance_
            )  # New estimate of the covariance matrix

        # Store the other attributes
        self.means_ = np.asarray(means)
        self.scalings_ = scalings
        self.rotations_ = rotations
        return self
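
The per-class covariance blending near the end is a convex combination of each class covariance with the pooled estimate; a tiny numpy sketch of that step (alpha and the matrices are made up):

import numpy as np

alpha = 0.3
cov_class = np.array([[2.0, 0.5], [0.5, 1.0]])   # covariance of one class
cov_pooled = np.eye(2)                           # pooled (shared) covariance estimate
cov_blended = alpha * cov_class + (1 - alpha) * cov_pooled
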
Example #33
    def fit(self, X, y, groups=None, sample_weight=None):
        """ Fit ensemble classifers and the meta-classifier.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : numpy array, shape = [n_samples]
            Target values.

        groups : numpy array/None, shape = [n_samples]
            The group that each sample belongs to. This is used by specific
            folding strategies such as GroupKFold()

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights passed as sample_weight to each classifier
            in the classifiers list as well as the meta_classifier.
            Raises an error if some classifier does not support
            sample_weight in the fit() method.

        Returns
        -------
        self : object

        """
        if self.use_clones:
            self.clfs_ = clone(self.classifiers)
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier
        if self.verbose > 0:
            print("Fitting %d classifiers..." % (len(self.classifiers)))

        final_cv = check_cv(self.cv, y, classifier=self.stratify)
        if isinstance(self.cv, int):
            # Override shuffle parameter in case of self generated
            # cross-validation strategy
            final_cv.shuffle = self.shuffle
            final_cv.random_state = self.random_state

        # Input validation.
        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=None)

        if sample_weight is None:
            fit_params = None
        else:
            fit_params = dict(sample_weight=sample_weight)

        meta_features = None

        for n, model in enumerate(self.clfs_):

            if self.verbose > 0:
                i = self.clfs_.index(model) + 1
                print("Fitting classifier%d: %s (%d/%d)" %
                      (i, _name_estimators(
                          (model, ))[0][0], i, len(self.clfs_)))

            if self.verbose > 2:
                if hasattr(model, 'verbose'):
                    model.set_params(verbose=self.verbose - 2)

            if self.verbose > 1:
                print(_name_estimators((model, ))[0][1])

            prediction = cross_val_predict(
                model,
                X,
                y,
                groups=groups,
                cv=final_cv,
                n_jobs=self.n_jobs,
                fit_params=fit_params,
                verbose=self.verbose,
                pre_dispatch=self.pre_dispatch,
                method='predict_proba' if self.use_probas else 'predict')

            if not self.use_probas:
                prediction = prediction[:, np.newaxis]
            elif self.drop_last_proba:
                prediction = prediction[:, :-1]

            if meta_features is None:
                meta_features = prediction
            else:
                meta_features = np.column_stack((meta_features, prediction))

        if self.store_train_meta_features:
            self.train_meta_features_ = meta_features

        # Fit the base models correctly this time using ALL the training set
        for model in self.clfs_:
            if sample_weight is None:
                model.fit(X, y)
            else:
                model.fit(X, y, sample_weight=sample_weight)

        # Fit the secondary model
        if self.use_features_in_secondary:
            meta_features = self._stack_first_level_features(X, meta_features)

        if sample_weight is None:
            self.meta_clf_.fit(meta_features, y)
        else:
            self.meta_clf_.fit(meta_features, y, sample_weight=sample_weight)

        return self
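
This fit method has the shape of mlxtend's StackingCVClassifier; if that is the class in use, a typical call would look like the hypothetical sketch below (the base and meta estimators are assumptions, not from the source):

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingCVClassifier

stack = StackingCVClassifier(
    classifiers=[KNeighborsClassifier(), DecisionTreeClassifier()],
    meta_classifier=LogisticRegression(),
    use_probas=True, cv=5)
# stack.fit(X, y) would then run the cross-validated stacking shown above
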
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        test_data = None
        if self.test_data is not None:
            test_data = check_array(self.test_data, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()
        init_features = partial(_init_selected_features, n_features=n_features)
        toolbox.register("individual", tools.initIterate, creator.Individual,
                         init_features)
        toolbox.register("population", tools.initRepeat, list,
                         toolbox.individual)
        toolbox.register("evaluate",
                         _eval_function,
                         gaobject=self,
                         estimator=estimator,
                         X=X,
                         y=y,
                         cv=cv,
                         scorer=scorer,
                         verbose=self.verbose,
                         fit_params=self.fit_params,
                         caching=self.caching,
                         test_data=test_data)
        toolbox.register("mate",
                         tools.cxUniform,
                         indpb=self.crossover_independent_proba)
        toolbox.register("mutate",
                         tools.mutFlipBit,
                         indpb=self.mutation_independent_proba)
        toolbox.register("select",
                         tools.selTournament,
                         tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(
                processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(5, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", self.rounded_mean)
        stats.register("std", self.rounded_std)
        stats.register("min", self.rounded_min)
        stats.register("max", self.rounded_max)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = algorithms.eaSimple(pop,
                                     toolbox,
                                     cxpb=self.crossover_proba,
                                     mutpb=self.mutation_proba,
                                     ngen=self.n_generations,
                                     stats=stats,
                                     halloffame=hof,
                                     verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        print('done')
        # Set final attributes
        support_ = np.array(hof, dtype=bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.generation_scores_ = np.array(
            [score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
Example #35
    def make_imbalance(self, ratio=None, random_state=None):
        """
        Built on the imblearn.make_imbalance function
        :param ratio: dict or list
               Ratio to use for resampling the data set.
               - When 'dict', the keys correspond to the targeted classes. The values correspond to the desired number
                 of samples for each targeted class.
               - When 'list', the values correspond to the proportions of samples (float) assigned to each class. In
                 this case the number of samples is maintained but the samples per class are adjusted to the given
                 proportions.
        :param random_state: int, RandomState instance or None, optional (default=None)
               If int, random_state is the seed used by the random number generator; If RandomState instance,
               random_state is the random number generator; If None, the random number generator is the RandomState
               instance used by `np.random`.
        :return:
        """
        x, y = check_X_y(self.data, self.target)
        original_dataset_size = len(y)
        n_classes = len(self.target_names)

        if isinstance(ratio, dict):
            ratio_ = ratio

        elif isinstance(ratio, list):
            weights = ratio
            if len(weights) != n_classes:
                raise ValueError(
                    "{} classes available but only {} values provided".format(
                        n_classes, len(weights)))
            ratio_ = {}
            for i in range(n_classes):
                ratio_[i] = int(round(weights[i] * original_dataset_size, 0))

        else:
            raise TypeError("Expected dict or list; {} provided".format(
                type(ratio)))

        if sum(ratio_.values()) < original_dataset_size:
            rus = RandomUnderSampler(ratio=ratio_, random_state=random_state)
            self.data, self.target = rus.fit_sample(x, y)

        elif sum(ratio_.values()) == original_dataset_size:
            original_distribution = Counter(y)
            interim_ratio = {}
            for key in ratio_:
                if ratio_[key] >= original_distribution[key]:
                    interim_ratio[key] = original_distribution[key]
                else:
                    interim_ratio[key] = ratio_[key]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                rus = RandomUnderSampler(ratio=interim_ratio,
                                         random_state=random_state)
                x_int, y_int = rus.fit_sample(x, y)
            with warnings.catch_warnings():
                # Silencing RandomOverSampler UserWarning: After over-sampling, the number of samples in class A will
                # be larger than the number of samples in the majority class
                warnings.simplefilter("ignore")
                ros = RandomOverSampler(ratio=ratio_,
                                        random_state=random_state)
                self.data, self.target = ros.fit_sample(x_int, y_int)

        else:
            raise ValueError(
                "The requested dataset cannot be larger than the original dataset"
            )
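
For the 'list' branch above, a quick worked example of how proportions become per-class target counts (the numbers are made up):

original_dataset_size, weights = 100, [0.3, 0.7]   # two classes
ratio_ = {i: int(round(w * original_dataset_size, 0)) for i, w in enumerate(weights)}
print(ratio_)   # -> {0: 30, 1: 70}
# sum(ratio_.values()) == original_dataset_size, so the equal-size branch runs
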
Example #36
    def fit(self, X, y=None):
        """Derived from https://github.com/fchollet/keras/blob/master/keras/wrappers/scikit_learn.py
        Adds:
        Handling pandas inputs
        Saving of model into the class to allow for easy pickling

        Parameters
        ----------

        X : pandas dataframe or array-like
           training samples
        y : array-like, required for array-like X; not presently used for pandas dataframe
           class labels

        Returns
        -------
        self: object

        """
        if isinstance(X, pd.DataFrame):
            df = X
            (X, y, self.vectorizer) = self.convert_numpy(df)
        else:
            check_X_y(X, y)

        input_width = X.shape[1]
        num_classes = len(y.unique())
        logger.info("input_width %d", input_width)
        logger.info("num_classes %d", num_classes)
        train_y = np_utils.to_categorical(y, num_classes)
        self.model = self.model_create(input_width, num_classes)

        if len(y.shape) == 1:
            self.classes_ = list(np.unique(y))
            if self.loss == 'categorical_crossentropy':
                y = to_categorical(y)
        else:
            self.classes_ = np.arange(0, y.shape[1])

        if self.compiled_model_ is None:
            self.compiled_model_ = copy.deepcopy(self.model)
            self.compiled_model_.compile(optimizer=self.optimizer,
                                         loss=self.loss)
        history = self.compiled_model_.fit(
            X,
            y,
            batch_size=self.train_batch_size,
            nb_epoch=self.nb_epoch,
            verbose=self.verbose,
            shuffle=self.shuffle,
            show_accuracy=self.show_accuracy,
            validation_split=self.validation_split,
            validation_data=self.validation_data,
            callbacks=self.callbacks)

        self.config_ = self.model.to_json()
        self.compiled_model_.save_weights(self.tmp_model)
        with open(self.tmp_model,
                  mode='rb') as file:  # b is important -> binary
            self.model_saved = file.read()
        return self
Example #37
    def partial_fit(self, X=None, y=None, labels=None, n_features=10):
        if X is None:
            if labels is None:
                raise ValueError("labels should be provided at first call to "
                                 "partial_fit.")
            if n_features is None:
                raise ValueError("n_features should be provided at first call "
                                 "to partial_fit.")
            self.rng_ = check_random_state(self.random_state)
            n_hidden = self.n_hidden
            self.classes_ = labels

            self.wi_ = self.rng_.multivariate_normal(
                np.zeros((n_features + 1) * n_hidden),
                self.prior_scale * np.eye((n_features + 1) * n_hidden),
                size=self.n_iter)

            self.wo_ = self.rng_.multivariate_normal(
                np.zeros(n_hidden * len(labels)),
                self.prior_scale * np.eye(n_hidden * len(self.classes_)),
                size=self.n_iter)

        else:
            n_hidden = self.n_hidden
            X, y = check_X_y(X, y)
            n_features = self.wi_.shape[1] // n_hidden

            samples_i = np.zeros((self.n_iter, n_features, n_hidden))
            samples_o = np.zeros((self.n_iter, n_hidden, len(self.classes_)))
            weights = np.zeros(self.n_iter)
            cov_i = self.scale * np.eye(n_features * n_hidden)
            cov_o = self.scale * np.eye(n_hidden * len(self.classes_))

            for i in range(self.n_iter):
                s_i = self.rng_.multivariate_normal(self.wi_[i], cov_i)
                samples_i[i] = s_i.reshape(n_features, n_hidden)
                s_o = self.rng_.multivariate_normal(self.wo_[i], cov_o)
                samples_o[i] = s_o.reshape(n_hidden, len(self.classes_))

                reg = self.alpha * (np.dot(s_i, s_i) + np.dot(s_o, s_o))
                loss = -log_loss(y,
                                 self.forward(X, samples_i[i], samples_o[i]),
                                 labels=self.classes_)

                weights[i] = loss - reg

            self.samples_i_ = samples_i
            self.samples_o_ = samples_o
            self.weights_ = softmax_1D(weights)

            self.multi_ = self.rng_.multinomial(self.n_iter, self.weights_)
            resampled = np.repeat(np.arange(self.n_iter), self.multi_)

            self.wi_ = self.wi_[resampled]
            self.wo_ = self.wo_[resampled]

            if self.local not in [None, "mh", "basinhopping"]:
                raise ValueError(
                    "local should be one of None, mh or basinhopping")
            if self.local == "mh":
                for i in range(self.n_iter):
                    self.wi_[i], self.wo_[i] = self.mh_step(
                        X, y, self.wi_[i], self.wo_[i])
            elif self.local == "basinhopping":
                wi_len = len(self.wi_[0])
                for i in range(self.n_iter):
                    x0 = np.concatenate((self.wi_[i], self.wo_[i]))

                    opt_func = partial(log_likelihood, X=X, y=y, mlp=self)
                    res = basinhopping(opt_func, x0)
                    self.wi_[i], self.wo_[i] = res.x[:wi_len], res.x[wi_len:]

            self.coef_i_ = np.mean(self.wi_.reshape(self.n_iter, n_features,
                                                    n_hidden),
                                   axis=0)
            self.coef_o_ = np.mean(self.wo_.reshape(self.n_iter, n_hidden,
                                                    len(self.classes_)),
                                   axis=0)
            return self
Beispiel #38
0
    def _batch_fit(self, X, y, check_input=False):
        print('Batch fit')
        if check_input:
            X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)

        current_n_samples, n_features = X.shape
        # Update stats - they are 0 if this is the first step
        updated_mean, updated_var, updated_n_samples_seen_ = _incremental_mean_and_var(
            X,
            last_mean=self.mean_,
            last_variance=self.var_,
            last_sample_count=self.n_samples_seen_)
        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X = np.subtract(X, updated_mean)
        else:
            col_batch_mean = np.mean(X, axis=0)
            X = np.subtract(X, col_batch_mean)

        # Updating algorithm
        # First update class means
        updated_class_mean = self.class_mean_
        updated_class_n_samples_seen_ = self.class_n_samples_seen_
        # print('updated_class_n_samples_seen_', updated_class_n_samples_seen_)
        # print('updated_class_mean', updated_class_mean)
        for i, current_class in enumerate(self.classes_):
            current_class_samples = X[y == current_class, :]
            n_current_class_samples = current_class_samples.shape[0]
            previous_n_class_samples = updated_class_n_samples_seen_[i]
            if n_current_class_samples > 0 and previous_n_class_samples > 0:
                previous_class_sum_current_class = updated_class_mean[
                    i, :] * updated_class_n_samples_seen_[i]
                current_class_sum_current_class = np.sum(current_class_samples,
                                                         axis=0)

                # print('previous_class_sum_current_class.shape', previous_class_sum_current_class.shape)
                # print('current_class_sum_current_class.shape', current_class_sum_current_class.shape)
                # print('updated_class_mean.shape', updated_class_mean.shape)
                # print('updated_class_n_samples_seen_.shape', updated_class_n_samples_seen_[i])

                updated_class_n_samples_seen_[i] += n_current_class_samples
                updated_class_mean[i, :] = (
                    previous_class_sum_current_class +
                    current_class_sum_current_class
                ) / updated_class_n_samples_seen_[i]
            elif n_current_class_samples > 0:
                updated_class_mean[i, :] = np.mean(current_class_samples,
                                                   axis=0)
                updated_class_n_samples_seen_[i] = n_current_class_samples

        # Then update between class scatter
        updated_between_scatter = self.between_scatter
        for i, current_class_mean in enumerate(updated_class_mean):
            n = X[y == self.classes_[i], :].shape[0]
            current_class_mean = current_class_mean.reshape(1, n_features)
            updated_mean = updated_mean.reshape(1, n_features)
            if n > 0:
                updated_between_scatter += n * (
                    current_class_mean -
                    updated_mean).T.dot(current_class_mean - updated_mean)

        # if np.any(np.isnan(updated_between_scatter)):
        #     print('Reached nan:::: ', n)
        #     print('Updatec class mean:::', updated_class_mean)
        #     print('updated mean::::', updated_mean)

        updated_class_within_scatter = self.class_within_scatter
        for i, current_class_mean in enumerate(updated_class_mean):
            current_class_samples = X[y == self.classes_[i], :]
            n_current_class_samples = current_class_samples.shape[0]
            l_c = current_class_samples.shape[0]
            n_c = self.class_n_samples_seen_[i]
            mean_y_c = np.reshape(np.mean(current_class_samples, axis=0),
                                  (n_features, 1))

            if n_current_class_samples > 0 and n_c > 0:
                # print('current_class_samples.shape', current_class_samples.shape)
                mean_x_c = np.reshape(self.class_mean_[i, :], (n_features, 1))

                D_c = (mean_y_c - mean_x_c).dot((mean_y_c - mean_x_c).T)

                E_c = np.zeros(D_c.shape)
                for j, current_sample in enumerate(current_class_samples):
                    diff = current_sample.reshape(n_features, 1) - mean_x_c
                    E_c += diff.dot(diff.T)

                F_c = np.zeros(D_c.shape)
                for j, current_sample in enumerate(current_class_samples):
                    diff = current_sample.reshape(n_features, 1) - mean_y_c
                    F_c += diff.dot(diff.T)

                updated_class_within_scatter[:, :, i] += ((n_c * l_c * l_c) * D_c / np.square(n_c + l_c)) + \
                                                         ((np.square(n_c) * E_c) / np.square(n_c + l_c)) + \
                                                         ((l_c * (l_c + (2 * n_c)) * F_c) / np.square(n_c + l_c))
            elif n_current_class_samples > 0:
                centered = current_class_samples - mean_y_c.T
                updated_class_within_scatter[:, :, i] = centered.T.dot(centered)
        updated_within_scatter = np.sum(updated_class_within_scatter, axis=2)

        # Final values after computation
        self.n_samples_seen_ = updated_n_samples_seen_
        self.class_n_samples_seen_ = updated_class_n_samples_seen_
        self.mean_ = updated_mean
        self.class_mean_ = updated_class_mean
        self.var_ = updated_var
        self.between_scatter = updated_between_scatter
        self.within_scatter = updated_within_scatter
        self.class_within_scatter = updated_class_within_scatter
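# A small numpy check of the pooled class-mean update performed in _batch_fit above:
# keeping a running sum and count per class reproduces the mean of the concatenated
# batches. The two toy batches below are hypothetical.
import numpy as np

batch_a = np.array([[1.0, 2.0], [3.0, 4.0]])
batch_b = np.array([[5.0, 6.0]])

prev_mean, prev_n = batch_a.mean(axis=0), batch_a.shape[0]
new_sum, new_n = batch_b.sum(axis=0), batch_b.shape[0]

updated_mean = (prev_mean * prev_n + new_sum) / (prev_n + new_n)
assert np.allclose(updated_mean, np.vstack((batch_a, batch_b)).mean(axis=0))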
Beispiel #39
0
    def _single_fit(self, X, y, check_input=False):
        print('Single Fit')
        if check_input:
            X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)
Beispiel #40
0
    def fit(self, X, y):
        if self.model is not None:
            thundersvm.model_free(c_void_p(self.model))
            self.model = None
        sparse = sp.isspmatrix(X)
        self._sparse = sparse and not callable(self.kernel)
        X, y = check_X_y(X,
                         y,
                         dtype=np.float64,
                         order='C',
                         accept_sparse='csr')
        y = self.label_validate(y)

        solver_type = SVM_TYPE.index(self._impl)

        if self.gamma == 'auto':
            self._gamma = 1.0 / X.shape[1]
        else:
            self._gamma = self.gamma
        if self.kernel not in KERNEL_TYPE:
            raise ValueError(
                "The kernel parameter %r is not recognized; please refer to "
                "the documentation." % self.kernel)
        kernel = KERNEL_TYPE.index(self.kernel)

        fit = self._sparse_fit if self._sparse else self._dense_fit
        thundersvm.model_new.restype = c_void_p
        self.model = thundersvm.model_new(solver_type)
        if self.max_mem_size != -1:
            thundersvm.set_memory_size(c_void_p(self.model), self.max_mem_size)
        fit(X, y, solver_type, kernel)
        if self._train_succeed[0] == -1:
            print("Training failed!")
            return
        self.n_sv = thundersvm.n_sv(c_void_p(self.model))
        csr_row = (c_int * (self.n_sv + 1))()
        csr_col = (c_int * (self.n_sv * self.n_features))()
        csr_data = (c_float * (self.n_sv * self.n_features))()
        data_size = (c_int * 1)()
        thundersvm.get_sv(csr_row, csr_col, csr_data, data_size,
                          c_void_p(self.model))
        dual_coef = (c_float * ((self.n_classes - 1) * self.n_sv))()
        thundersvm.get_coef(dual_coef, self.n_classes, self.n_sv,
                            c_void_p(self.model))

        self.dual_coef_ = np.array([
            dual_coef[index]
            for index in range(0, (self.n_classes - 1) * self.n_sv)
        ]).astype(float)
        self.dual_coef_ = np.reshape(self.dual_coef_,
                                     (self.n_classes - 1, self.n_sv))

        rho_size = int(self.n_classes * (self.n_classes - 1) / 2)
        self.n_binary_model = rho_size
        rho = (c_float * rho_size)()
        thundersvm.get_rho(rho, rho_size, c_void_p(self.model))

        if self.kernel == 'linear':
            coef = (c_float * (self.n_binary_model * self.n_sv))()
            thundersvm.get_linear_coef(coef, self.n_binary_model,
                                       self.n_features, c_void_p(self.model))
            self.coef_ = np.array([
                coef[index]
                for index in range(0, self.n_binary_model * self.n_features)
            ]).astype(float)
            self.coef_ = np.reshape(self.coef_,
                                    (self.n_binary_model, self.n_features))

        self.intercept_ = np.array(
            [rho[index] for index in range(0, rho_size)]).astype(float)

        self.row = np.array(
            [csr_row[index] for index in range(0, self.n_sv + 1)])
        self.col = np.array(
            [csr_col[index] for index in range(0, data_size[0])])
        self.data = np.array(
            [csr_data[index] for index in range(0, data_size[0])])

        self.support_vectors_ = sp.csr_matrix((self.data, self.col, self.row))
        if not self._sparse:
            self.support_vectors_ = self.support_vectors_.toarray(order='C')
        n_support_ = (c_int * self.n_classes)()
        thundersvm.get_support_classes(n_support_, self.n_classes,
                                       c_void_p(self.model))

        self.n_support_ = np.array([
            n_support_[index] for index in range(0, self.n_classes)
        ]).astype(int)

        self.shape_fit_ = X.shape

        return self
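# A minimal sketch of how the (data, col, row) triplet fetched from thundersvm is turned
# into the support-vector matrix above: scipy's csr_matrix((data, indices, indptr))
# constructor. The toy arrays below are hypothetical.
import numpy as np
import scipy.sparse as sp

data = np.array([1.0, 2.0, 3.0])   # non-zero values (plays the role of self.data)
indices = np.array([0, 2, 1])      # column index of each value (self.col)
indptr = np.array([0, 2, 3])       # row i owns data[indptr[i]:indptr[i + 1]] (self.row)

sv = sp.csr_matrix((data, indices, indptr))
print(sv.toarray(order='C'))       # dense view, as used when self._sparse is False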
Beispiel #41
0
def make_imbalance(X,
                   y,
                   sampling_strategy=None,
                   random_state=None,
                   verbose=False,
                   **kwargs):
    """Turns a dataset into an imbalanced dataset with a specific sampling
    strategy.

    Read more in the :ref:`User Guide <make_imbalanced>`.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data to be imbalanced.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    sampling_strategy : dict, or callable,
        Ratio to use for resampling the data set.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

        - When callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples for each class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    verbose : bool, optional (default=False)
        Show information regarding the sampling.

    kwargs : dict, optional
        Dictionary of additional keyword arguments to pass to
        ``sampling_strategy``.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the imbalanced data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    Notes
    -----
    See
    :ref:`sphx_glr_auto_examples_applications_plot_multi_class_under_sampling.py`,
    :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py`, and
    :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import load_iris
    >>> from imblearn.datasets import make_imbalance

    >>> data = load_iris()
    >>> X, y = data.data, data.target
    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
    >>> X_res, y_res = make_imbalance(X, y,
    ...                               sampling_strategy={0: 10, 1: 20, 2: 30},
    ...                               random_state=42)
    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})

    """
    X, y = check_X_y(X, y)
    target_stats = Counter(y)
    # restrict ratio to be a dict or a callable
    if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
        sampling_strategy_ = check_sampling_strategy(sampling_strategy, y,
                                                     "under-sampling",
                                                     **kwargs)
    else:
        raise ValueError(
            "'sampling_strategy' has to be a dictionary or a "
            "function returning a dictionary. Got {} instead.".format(
                type(sampling_strategy)))

    if verbose:
        print(
            "The original target distribution in the dataset is: %s",
            target_stats,
        )
    rus = RandomUnderSampler(
        sampling_strategy=sampling_strategy_,
        replacement=False,
        random_state=random_state,
    )
    X_resampled, y_resampled = rus.fit_resample(X, y)
    if verbose:
        print("Make the dataset imbalanced: %s", Counter(y_resampled))

    return X_resampled, y_resampled
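# A minimal sketch of the callable form of ``sampling_strategy`` documented above. The
# helper name ``ratio_func`` and its keyword arguments are hypothetical; they are
# forwarded to the callable through ``**kwargs`` by make_imbalance.
from collections import Counter
from sklearn.datasets import load_iris


def ratio_func(y, multiplier, minority_class):
    target_stats = Counter(y)
    return {minority_class: int(multiplier * target_stats[minority_class])}


X_iris, y_iris = load_iris(return_X_y=True)
X_res, y_res = make_imbalance(X_iris, y_iris,
                              sampling_strategy=ratio_func,
                              multiplier=0.5,
                              minority_class=1,
                              random_state=0)
print(Counter(y_res))   # class 1 reduced to 50 * 0.5 = 25 samples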
Beispiel #42
0
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        X, y = check_X_y(X, y)

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the standard
        # deviation of the largest dimension.
        epsilon = 1e-9 * np.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))
            self.class_prior_ = np.zeros(n_classes)
            self.class_count_ = np.zeros(n_classes)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                msg = "Number of features %d does not match previous data %d."
                raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= epsilon

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError("The target label(s) %s in y do not exist in the "
                             "initial classes %s" %
                             (y[~unique_y_in_classes], classes))

        for y_i in unique_y:
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[y == y_i]
                N_i = sw_i.sum()
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
                X_i, sw_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += N_i

        self.sigma_[:, :] += epsilon
        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
        #print self.class_prior_[:]
        return self
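# A small, self-contained check of the kind of streaming mean/variance merge that the
# _update_mean_variance helper called above performs (assumption: the standard
# Chan et al. pairwise-update formula). Merging per-batch statistics reproduces the
# statistics of the concatenated data.
import numpy as np

rng = np.random.RandomState(0)
a, b = rng.randn(40, 3), rng.randn(60, 3)

n_a, mu_a, var_a = a.shape[0], a.mean(axis=0), a.var(axis=0)
n_b, mu_b, var_b = b.shape[0], b.mean(axis=0), b.var(axis=0)

n = n_a + n_b
delta = mu_b - mu_a
mu = mu_a + delta * n_b / n
var = (var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n) / n

full = np.vstack((a, b))
assert np.allclose(mu, full.mean(axis=0)) and np.allclose(var, full.var(axis=0))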
def test_check_array_min_samples_and_features_messages():
    # empty list is considered 2D by default:
    msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [[]])

    # If considered a 1D collection when ensure_2d=False, then the minimum
    # number of samples will break:
    msg = "0 sample(s) (shape=(0,)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_array, [], ensure_2d=False)

    # Invalid edge case when checking the default minimum sample of a scalar
    msg = "Singleton array array(42) cannot be considered a valid collection."
    assert_raise_message(TypeError, msg, check_array, 42, ensure_2d=False)

    # Simulate a model that would need at least 2 samples to be well defined
    X = np.ones((1, 10))
    y = np.ones(1)
    msg = "1 sample(s) (shape=(1, 10)) while a minimum of 2 is required."
    assert_raise_message(ValueError,
                         msg,
                         check_X_y,
                         X,
                         y,
                         ensure_min_samples=2)

    # The same message is raised if the data has 2 dimensions even if this is
    # not mandatory
    assert_raise_message(ValueError,
                         msg,
                         check_X_y,
                         X,
                         y,
                         ensure_min_samples=2,
                         ensure_2d=False)

    # Simulate a model that would require at least 3 features (e.g. SelectKBest
    # with k=3)
    X = np.ones((10, 2))
    y = np.ones(2)
    msg = "2 feature(s) (shape=(10, 2)) while a minimum of 3 is required."
    assert_raise_message(ValueError,
                         msg,
                         check_X_y,
                         X,
                         y,
                         ensure_min_features=3)

    # Only the feature check is enabled whenever the number of dimensions is 2
    # even if allow_nd is enabled:
    assert_raise_message(ValueError,
                         msg,
                         check_X_y,
                         X,
                         y,
                         ensure_min_features=3,
                         allow_nd=True)

    # Simulate a case where a pipeline stage as trimmed all the features of a
    # 2D dataset.
    X = np.empty(0).reshape(10, 0)
    y = np.ones(10)
    msg = "0 feature(s) (shape=(10, 0)) while a minimum of 1 is required."
    assert_raise_message(ValueError, msg, check_X_y, X, y)

    # nd-data is not checked for any minimum number of features by default:
    X = np.ones((10, 0, 28, 28))
    y = np.ones(10)
    X_checked, y_checked = check_X_y(X, y, allow_nd=True)
    assert_array_equal(X, X_checked)
    assert_array_equal(y, y_checked)
Beispiel #44
0
    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        self.min_c_ = None
        self.maj_c_ = None
        self.stats_c_ = {}
        self.X_shape_ = None

        if hasattr(self, 'ratio'):
            self._validate_ratio()

        if hasattr(self, 'size_ngh'):
            self._validate_size_ngh_deprecation()
        elif hasattr(self, 'k') and not hasattr(self, 'm'):
            self._validate_k_deprecation()
        elif hasattr(self, 'k') and hasattr(self, 'm'):
            self._validate_k_m_deprecation()

        self.logger.info('Compute classes statistics ...')

        # Raise an error if there is only one class
        # if uniques.size == 1:
        #     raise RuntimeError("Only one class detected, aborting...")
        # Raise a warning for the moment to be compatible with BaseEstimator
        self.logger.debug('The number of classes is %s', np.unique(y).size)
        self.logger.debug('Shall we raise a warning: %s',
                          np.unique(y).size == 1)
        if np.unique(y).size == 1:
            warnings.simplefilter('always', UserWarning)
            warnings.warn('Only one class detected, something will go wrong')
            self.logger.debug('The warning should have been raised.')

        # Store the size of X to check at sampling time if we have the
        # same data
        self.X_shape_ = X.shape

        # Create a dictionary containing the class statistics
        self.stats_c_ = Counter(y)

        # Find the minority and majority classes
        self.min_c_ = min(self.stats_c_, key=self.stats_c_.get)
        self.maj_c_ = max(self.stats_c_, key=self.stats_c_.get)

        self.logger.info('%s classes detected: %s',
                         np.unique(y).size, self.stats_c_)

        # Check if the ratio provided at initialisation makes sense
        if isinstance(self.ratio, float):
            if self.ratio < (self.stats_c_[self.min_c_] /
                             self.stats_c_[self.maj_c_]):
                raise RuntimeError('The ratio requested at initialisation'
                                   ' should be greater than or equal to the'
                                   ' balancing ratio of the current data.')

        return self
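# A minimal sketch of the class-statistics bookkeeping done in fit above: Counter gives
# the per-class counts, min/max with key=stats.get pick the minority/majority classes,
# and a float ratio is only acceptable if it is at least the current imbalance ratio.
# The toy labels below are hypothetical.
from collections import Counter

y_toy = [0] * 90 + [1] * 10
stats = Counter(y_toy)
min_c = min(stats, key=stats.get)            # -> 1
maj_c = max(stats, key=stats.get)            # -> 0
current_ratio = stats[min_c] / stats[maj_c]  # 10 / 90, about 0.11

requested_ratio = 0.05
if requested_ratio < current_ratio:
    print('requested ratio is below the current balancing ratio; fit() would raise')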
def _eval_function(individual,
                   gaobject,
                   estimator,
                   X,
                   y,
                   cv,
                   scorer,
                   verbose,
                   fit_params,
                   caching,
                   test_data=None):
    individual_sum = np.sum(individual, axis=0)
    if individual_sum == 0:
        return -10000, individual_sum
    individual_tuple = tuple(individual)
    if caching and individual_tuple in gaobject.scores_cache:
        return gaobject.scores_cache[individual_tuple], individual_sum
    x_selected = X[:, np.array(individual, dtype=bool)]

    scores = []
    x_holdout_selected = []

    if fit_params['eval_set'] is not None:
        eval_set_params = copy.deepcopy(fit_params)
        for i, valid_data in enumerate(eval_set_params['eval_set']):
            x_holdout, y_holdout = check_X_y(valid_data[0], valid_data[1],
                                             "csr")
            x_holdout_selected = x_holdout[:, np.array(individual, dtype=bool)]
            eval_set_params['eval_set'][i][0] = x_holdout_selected
            eval_set_params['eval_set'][i][1] = y_holdout
    else:
        eval_set_params = fit_params

    fold = 0
    x_test_selected = oof_test = oof_train = oof_test_skf = oof_holdout = None
    if test_data is not None:
        x_test_selected = test_data[:, np.array(individual, dtype=bool)]
        oof_train = np.zeros((x_selected.shape[0], ))
        oof_holdout = np.empty(
            (cv.get_n_splits(), x_holdout_selected.shape[0]))
        oof_test = np.zeros((x_test_selected.shape[0], ))
        oof_test_skf = np.empty((cv.get_n_splits(), x_test_selected.shape[0]))

    fit_time = score_time = 0
    start_time = time.time()

    for train, test in cv.split(x_selected, y):
        x_selected_test, y_test = check_X_y(x_selected[test], y[test], "csr")
        eval_set_params['eval_set'].append([x_selected_test, y_test])
        eval_set_params['eval_names'].append('cv-valid')

        print('reset estimator')
        estimator = clone(estimator)

        score = _fit_and_score(estimator=estimator,
                               X=x_selected,
                               y=y,
                               scorer=scorer,
                               train=train,
                               test=test,
                               verbose=verbose,
                               parameters=None,
                               fit_params=eval_set_params)

        # cleanup for the next round
        del eval_set_params['eval_names'][1]
        del eval_set_params['eval_set'][1], x_selected_test, y_test
        gc.collect()

        fit_time = time.time() - start_time
        print('Learning done in {:f} seconds'.format(fit_time))
        scores.append(score)

        # if it is not empty - we want oof predictions
        if test_data is not None:
            oof_train[test] = estimator.booster_.predict(
                x_selected[test], num_iteration=estimator.best_iteration_)
            oof_test_skf[fold, :] = estimator.booster_.predict(
                x_test_selected, num_iteration=estimator.best_iteration_)
            oof_holdout[fold, :] = estimator.booster_.predict(
                x_holdout_selected, num_iteration=estimator.best_iteration_)

            fold += 1

        score_time = time.time() - start_time - fit_time
        print('predicting done in {:f} seconds'.format(score_time))

    total_time = score_time + fit_time

    print('individual done in {:f} seconds'.format(total_time))

    scores_mean = np.mean(scores)
    scores_std = np.std(scores)
    data_dict = {}

    if test_data is not None:
        oof_test[:] = oof_test_skf.mean(axis=0)
        oof_train = oof_train.reshape(-1, 1)
        oof_test = oof_test.reshape(-1, 1)

        data_dict = {
            'holdout_score': float(estimator.best_score_['oof']['auc']),
            'holdout_prediction_folds': oof_holdout,
            'estimator_scores': estimator.best_score_,
            'oof_test_folds': oof_test_skf,
            'oof_train': oof_train,
            'oof_test_mean': oof_test,
            'estimator_params': estimator.get_params(),
            'estimator_feature_importance': estimator.feature_importances_,
            'estimator_best_iteration': int(estimator.best_iteration_),
            'estimator_n_features_': estimator.n_features_,
            'original_n_features': X.shape[1],
            'cv_scores': scores,
            'cv_score': scores_mean,
            'cv_score_std': scores_std,
            'folds': fold,
            'individual': individual,
            'individual_hash': str(hash(tuple(individual))),
            'time': time.time()
        }
        del scores, oof_test_skf, oof_test, oof_train, eval_set_params
        del x_test_selected
        gc.collect()

        name = '{:.5f}_{:d}_{:.4f}_{:.4f}_{}_oof_data'.format(
            data_dict['holdout_score'], data_dict['estimator_n_features_'],
            data_dict['cv_score'], data_dict['cv_score_std'],
            data_dict['individual_hash'])

        save_oof_predictions(name, data_dict)

    if caching:
        gaobject.scores_cache[individual_tuple] = scores_mean
        filename = os.path.join(os.getcwd(), 'cache.z')
        joblib.dump(gaobject.scores_cache, filename, compress=True)
        del filename

    print(80 * '=')
    print(80 * '=')
    print('Individual scored')
    if test_data is not None:
        print('holdout-score: {:.5f}'.format(data_dict['holdout_score']))
        print('cv-score     : {:.5f}'.format(data_dict['cv_score']))
        print('n_features   : {:6d}'.format(data_dict['estimator_n_features_']))
    print(80 * '=')
    print(80 * '=')
    del data_dict
    gc.collect()

    return scores_mean, individual_sum
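# A small sketch of the individual-to-feature-mask step used throughout _eval_function:
# a GA individual is a 0/1 chromosome, a boolean mask over columns selects the matching
# features, and the tuple of the individual is the hashable cache key. Toy data below
# is hypothetical.
import numpy as np

X_toy = np.arange(12).reshape(3, 4)
individual = [1, 0, 1, 0]                 # keep features 0 and 2
mask = np.array(individual, dtype=bool)   # plain bool; np.bool is a deprecated alias
print(X_toy[:, mask].shape)               # (3, 2)
print(tuple(individual))                  # key used for gaobject.scores_cache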
Beispiel #46
0
    def fit(self, X, y, sample_weight=None, relative_penalties=None):
        """Fit the model to training data. If n_splits > 1 also run n-fold cross
        validation on all values in lambda_path.

        The model will be fit n+1 times. On the first pass, the lambda_path
        will be determined, on the remaining passes, the model performance for
        each value of lambda. After cross validation, the attribute
        `cv_mean_score_` will contain the mean score over all folds for each
        value of lambda, and `cv_standard_error_` will contain the standard
        error of `cv_mean_score_` for each value of lambda. The value of lambda
        which achieves the best performance in cross validation will be saved
        to `lambda_max_` additionally, the largest value of lambda s.t.:
            cv_score(l) >= cv_score(lambda_max_) -\
                           cut_point * standard_error(lambda_max_)
        will be saved to `lambda_best_`.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Input features

        y : array, shape (n_samples,)
            Target values

        sample_weight : array, shape (n_samples,)
            Optional weight vector for observations

        relative_penalties: array, shape (n_features,)
            Optional relative weight vector for penalty.
            0 entries remove penalty.

        Returns
        -------
        self : object
            Returns self.
        """

        if self.alpha > 1 or self.alpha < 0:
            raise ValueError("alpha must be between 0 and 1")

        if self.n_splits > 0 and self.n_splits < 3:
            raise ValueError("n_splits must be at least 3")

        X, y = check_X_y(X, y, accept_sparse='csr', ensure_min_samples=2)
        if sample_weight is None:
            sample_weight = np.ones(X.shape[0])

        self._fit(X, y, sample_weight, relative_penalties)

        if self.n_splits >= 3:
            cv_scores = _score_lambda_path(self,
                                           X,
                                           y,
                                           sample_weight,
                                           relative_penalties,
                                           self.n_splits,
                                           self.scoring,
                                           classifier=False,
                                           n_jobs=self.n_jobs,
                                           verbose=self.verbose)

            self.cv_mean_score_ = np.atleast_1d(np.mean(cv_scores, axis=0))
            self.cv_standard_error_ = np.atleast_1d(stats.sem(cv_scores))

            self.lambda_max_inx_ = np.argmax(self.cv_mean_score_)
            self.lambda_max_ = self.lambda_path_[self.lambda_max_inx_]

            target_score = self.cv_mean_score_[self.lambda_max_inx_] -\
                self.cut_point * self.cv_standard_error_[self.lambda_max_inx_]

            self.lambda_best_inx_ = np.argwhere(
                self.cv_mean_score_ >= target_score)[0]
            self.lambda_best_ = self.lambda_path_[self.lambda_best_inx_]

            self.coef_ = self.coef_path_[..., self.lambda_best_inx_]
            self.coef_ = self.coef_.squeeze(axis=self.coef_.ndim - 1)
            self.intercept_ = self.intercept_path_[
                ..., self.lambda_best_inx_].squeeze()
            if self.intercept_.shape == ():  # convert 0d array to scalar
                self.intercept_ = float(self.intercept_)

        return self
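# A minimal numpy sketch of the lambda selection rule documented and implemented above:
# lambda_max_ has the best mean CV score, and lambda_best_ is the largest lambda whose
# score stays within cut_point standard errors of it. The score arrays are hypothetical.
import numpy as np

lambda_path = np.array([1.0, 0.5, 0.1, 0.05, 0.01])
cv_mean_score = np.array([0.60, 0.70, 0.78, 0.80, 0.79])
cv_standard_error = np.full(5, 0.02)
cut_point = 1.0

best_inx = np.argmax(cv_mean_score)
target = cv_mean_score[best_inx] - cut_point * cv_standard_error[best_inx]
best_1se_inx = np.argwhere(cv_mean_score >= target)[0]   # first index = largest lambda
print(lambda_path[best_inx], lambda_path[best_1se_inx])  # 0.05 and [0.1]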
Beispiel #47
0
def discr_stat(X,
               Y,
               dissimilarity="euclidean",
               remove_isolates=True,
               return_rdfs=True):
    """
    Computes the discriminability statistic.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        Input data. If dissimilarity=='precomputed', the input should be the
         dissimilarity matrix.
    Y : 1d-array, shape (n_samples)
        Input labels.
    dissimilarity : str, {"euclidean" (default), "precomputed"}
        Dissimilarity measure can be 'euclidean' (pairwise Euclidean distances
        between points in the dataset) or 'precomputed' (pre-computed
        dissimilarities).
    remove_isolates : bool, optional, default=True
        Whether to remove data that have single label.
    return_rdfs : bool, optional, default=True
        Whether to return rdf for all data points.

    Returns
    -------
    stat : float
        Discriminability statistic.
    rdfs : array, shape (n_samples, max{len(id)})
        Rdfs for each sample. Only returned if ``return_rdfs==True``.

    """
    check_X_y(X, Y, accept_sparse=True)

    uniques, counts = np.unique(Y, return_counts=True)
    if remove_isolates:
        idx = np.isin(Y, uniques[counts != 1])
        labels = Y[idx]

        if (dissimilarity == "euclidean" or dissimilarity == "cosine"
                or dissimilarity == "haversine" or dissimilarity == "manhattan"
                or dissimilarity == "mahalanobis"):
            X = X[idx]
        else:
            X = X[np.ix_(idx, idx)]
    else:
        labels = Y

    if dissimilarity == "euclidean":
        dissimilarities = euclidean_distances(X)
    elif dissimilarity == "cosine":
        dissimilarities = cosine_distances(X)
    elif dissimilarity == "haversine":
        dissimilarities = haversine_distances(X)
    elif dissimilarity == "manhattan":
        dissimilarities = manhattan_distances(X)
    else:
        dissimilarities = X

    rdfs = _discr_rdf(dissimilarities, labels)
    rdfs[rdfs < 0.5] = np.nan
    stat = np.nanmean(rdfs)

    if return_rdfs:
        return stat, rdfs
    else:
        return stat
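# A minimal usage sketch for discr_stat, assuming the surrounding module's imports
# (numpy as np, the sklearn pairwise distance functions, and the _discr_rdf helper)
# are available. Two well-separated hypothetical clusters should give a
# discriminability close to 1.
rng = np.random.RandomState(0)
X_toy = np.vstack((rng.randn(20, 2), rng.randn(20, 2) + 10.0))
Y_toy = np.repeat([0, 1], 20)

stat, rdfs = discr_stat(X_toy, Y_toy)   # return_rdfs defaults to True
print(stat, rdfs.shape)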
Beispiel #48
0
    def fit(self, X, y, labels=None, neighbors=None):
        """Generate the intra-label and inter-label distribution.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        labels : array-like, shape (n_samples,)
            Labels of each sample.

        neighbors : array-like, (n_neighboring_pairs, 2)
            An array that contains all neighboring pairs. Each row is
            a unique neighboring pair.

        Returns
        -------
        self : object,
            Return self.

        """

        # Check data
        X, y = check_X_y(X, y, dtype=None)

        # Set statistics
        self.majority_class_label_ = Counter(y).most_common()[0][0]
        self.unique_cluster_labels_ = (
            np.unique(labels) if labels is not None else np.array(0, dtype=int)
        )
        self.unique_class_labels_ = np.unique(y)
        self.n_samples_ = len(X)

        # Set default attributes
        self.labels_ = (
            np.repeat(0, len(X))
            if labels is None
            else check_array(labels, ensure_2d=False)
        )
        self.neighbors_ = (
            np.empty((0, 2), dtype=int)
            if neighbors is None
            else check_array(neighbors, ensure_2d=False)
        )
        self.intra_distribution_ = {
            (0, class_label): 1.0
            for class_label in np.unique(y)
            if class_label != self.majority_class_label_
        }
        self.inter_distribution_ = {}

        # Fit distributor
        self._fit(X, y, labels, neighbors)

        # Validate fitting procedure
        self._validate_fitting()

        return self
Beispiel #49
0
    def fit(self, X, Y):
        """Fit the model to data matrix X and targets Y.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        Y : array-like, shape (n_samples, n_classes)
            The target values.

        Returns
        -------
        self : object
            Returns self.
        """
        X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True)

        random_state = check_random_state(self.random_state)
        check_array(X, accept_sparse=True)
        self.order_ = self.order
        if self.order_ is None:
            self.order_ = np.array(range(Y.shape[1]))
        elif isinstance(self.order_, str):
            if self.order_ == 'random':
                self.order_ = random_state.permutation(Y.shape[1])
        elif sorted(self.order_) != list(range(Y.shape[1])):
                raise ValueError("invalid order")

        self.estimators_ = [clone(self.base_estimator)
                            for _ in range(Y.shape[1])]

        self.classes_ = []

        if self.cv is None:
            Y_pred_chain = Y[:, self.order_]
            if sp.issparse(X):
                X_aug = sp.hstack((X, Y_pred_chain), format='lil')
                X_aug = X_aug.tocsr()
            else:
                X_aug = np.hstack((X, Y_pred_chain))

        elif sp.issparse(X):
            Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
            X_aug = sp.hstack((X, Y_pred_chain), format='lil')

        else:
            Y_pred_chain = np.zeros((X.shape[0], Y.shape[1]))
            X_aug = np.hstack((X, Y_pred_chain))

        del Y_pred_chain

        for chain_idx, estimator in enumerate(self.estimators_):
            y = Y[:, self.order_[chain_idx]]
            estimator.fit(X_aug[:, :(X.shape[1] + chain_idx)], y)
            if self.cv is not None and chain_idx < len(self.estimators_) - 1:
                col_idx = X.shape[1] + chain_idx
                cv_result = cross_val_predict(
                    self.base_estimator, X_aug[:, :col_idx],
                    y=y, cv=self.cv)
                if sp.issparse(X_aug):
                    X_aug[:, col_idx] = np.expand_dims(cv_result, 1)
                else:
                    X_aug[:, col_idx] = cv_result

            self.classes_.append(estimator.classes_)
        return self
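# A small sketch of the chain augmentation used in fit above: estimator k in the chain
# is trained on the original features plus the first k label columns, i.e.
# X_aug[:, :n_features + k]. Toy shapes are hypothetical.
import numpy as np

n_samples, n_features, n_labels = 6, 4, 3
X_toy = np.zeros((n_samples, n_features))
Y_toy = np.zeros((n_samples, n_labels))

X_aug = np.hstack((X_toy, Y_toy))
for chain_idx in range(n_labels):
    print(chain_idx, X_aug[:, :n_features + chain_idx].shape)   # (6, 4), (6, 5), (6, 6)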
def visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False):  # pragma: no cover
    """
    Utility function for visualizing the results in examples
    Internal use only

    :param clf_name: The name of the detector
    :type clf_name: str

    :param X_train: The training samples
    :type X_train: numpy array of shape (n_samples, n_features)

    :param y_train: The ground truth of training samples
    :type y_train: list or array of shape (n_samples,)

    :param X_test: The test samples
    :type X_test: numpy array of shape (n_samples, n_features)

    :param y_test: The ground truth of test samples
    :type y_test: list or array of shape (n_samples,)

    :param y_train_pred: The predicted outlier scores on the training samples
    :type y_train_pred: numpy array of shape (n_samples, n_features)

    :param y_test_pred: The predicted outlier scores on the test samples
    :type y_test_pred: numpy array of shape (n_samples, n_features)

    :param show_figure: If set to True, show the figure
    :type show_figure: bool, optional (default=True)

    :param save_figure: If set to True, save the figure to the local
    :type save_figure: bool, optional (default=False)
    """

    if X_train.shape[1] != 2 or X_test.shape[1] != 2:
        raise ValueError("Input data has to be 2-d for visualization. The "
                         "input data has {shape}.".format(shape=X_train.shape))

    X_train, y_train = check_X_y(X_train, y_train)
    X_test, y_test = check_X_y(X_test, y_test)
    c_train = get_color_codes(y_train)
    c_test = get_color_codes(y_test)

    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name}".format(clf_name=clf_name))

    fig.add_subplot(221)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=c_train)
    plt.title('Train ground truth')
    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color='w',
               label='normal',
               markerfacecolor='b',
               markersize=8),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='outlier',
               markerfacecolor='r',
               markersize=8)
    ]

    plt.legend(handles=legend_elements, loc=4)

    fig.add_subplot(222)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=c_test)
    plt.title('Test ground truth')
    plt.legend(handles=legend_elements, loc=4)

    fig.add_subplot(223)
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train_pred)
    plt.title('Train prediction by {clf_name}'.format(clf_name=clf_name))
    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color='w',
               label='normal',
               markerfacecolor='0',
               markersize=8),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='outlier',
               markerfacecolor='yellow',
               markersize=8)
    ]
    plt.legend(handles=legend_elements, loc=4)

    fig.add_subplot(224)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test_pred)
    plt.title('Test prediction by {clf_name}'.format(clf_name=clf_name))
    plt.legend(handles=legend_elements, loc=4)

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)
    if show_figure:
        plt.show()
    return
    def fit(self, X, y, feature_labels=None):  # -1 for unlabeled
        """Fit rule lists to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data 

        y : array_like, shape = [n_samples]
            Labels
            
        feature_labels : array_like, shape = [n_features], optional (default: None)
            String labels for each feature. If none, features are simply enumerated

        Returns
        -------
        self : returns an instance of self.
        """
        if len(set(y)) != 2:
            raise Exception(
                "Only binary classification is supported at this time!")
        X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self)

        if feature_labels is None:
            feature_labels = ["ft" + str(i + 1) for i in range(len(X[0]))]
        self.feature_labels = feature_labels

        if type(X) != list:
            X = np.array(X).tolist()
        if 'str' not in str(type(X[0][0])):
            if self.verbose:
                print "Warning: non-categorical data. Trying to discretize. (Please convert categorical values to strings to avoid this.)"
            X = self.discretize(X, y)

        permsdic = defaultdict(
            default_permsdic)  #We will store here the MCMC results

        data = list(X[:])
        #Now find frequent itemsets
        #Mine separately for each class
        data_pos = [x for i, x in enumerate(data) if y[i] == 0]
        data_neg = [x for i, x in enumerate(data) if y[i] == 1]
        assert len(data_pos) + len(data_neg) == len(data)
        try:
            itemsets = [
                r[0] for r in fpgrowth(
                    data_pos, supp=self.minsupport, zmax=self.maxcardinality)
            ]
            itemsets.extend([
                r[0] for r in fpgrowth(
                    data_neg, supp=self.minsupport, zmax=self.maxcardinality)
            ])
        except TypeError:
            itemsets = [
                r[0] for r in fpgrowth(
                    data_pos, supp=self.minsupport, max=self.maxcardinality)
            ]
            itemsets.extend([
                r[0] for r in fpgrowth(
                    data_neg, supp=self.minsupport, max=self.maxcardinality)
            ])
        itemsets = list(set(itemsets))
        if self.verbose:
            print(len(itemsets), 'rules mined')
        #Now form the data-vs.-lhs set
        #X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        X = [set() for j in range(len(itemsets) + 1)]
        X[0] = set(range(len(data)))  #the default rule satisfies all data
        for (j, lhs) in enumerate(itemsets):
            X[j + 1] = set(
                [i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
        #now form lhs_len
        lhs_len = [0]
        for lhs in itemsets:
            lhs_len.append(len(lhs))
        nruleslen = Counter(lhs_len)
        lhs_len = np.array(lhs_len)
        itemsets_all = ['null']
        itemsets_all.extend(itemsets)

        Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
            X, np.vstack(
                (y, 1 - y)).T.astype(int), nruleslen, lhs_len, itemsets_all)

        #Do MCMC
        res, Rhat = run_bdl_multichain_serial(self.max_iter,
                                              self.thinning,
                                              self.alpha,
                                              self.listlengthprior,
                                              self.listwidthprior,
                                              Xtrain,
                                              Ytrain,
                                              nruleslen,
                                              lhs_len,
                                              self.maxcardinality,
                                              permsdic,
                                              self.burnin,
                                              self.n_chains,
                                              [None] * self.n_chains,
                                              verbose=self.verbose)

        #Merge the chains
        permsdic = merge_chains(res)

        ###The point estimate, BRL-point
        self.d_star = get_point_estimate(
            permsdic,
            lhs_len,
            Xtrain,
            Ytrain,
            self.alpha,
            nruleslen,
            self.maxcardinality,
            self.listlengthprior,
            self.listwidthprior,
            verbose=self.verbose)  #get the point estimate

        if self.d_star:
            #Compute the rule consequent
            self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain,
                                                     self.d_star, self.alpha,
                                                     True)

        return self
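# A minimal sketch of the data-vs-rule bookkeeping built in fit above: entry j + 1 is
# the set of row indices whose transaction contains itemset j, and entry 0 (the default
# rule) covers every row. Toy transactions and itemsets are hypothetical.
toy_data = [{'a', 'b'}, {'b', 'c'}, {'a', 'c'}]
toy_itemsets = [('a',), ('b', 'c')]

covered = [set(range(len(toy_data)))]   # default rule satisfies all rows
for lhs in toy_itemsets:
    covered.append({i for i, xi in enumerate(toy_data) if set(lhs).issubset(xi)})
print(covered)   # [{0, 1, 2}, {0, 2}, {1}]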
    def fit(self, X, y, sample_weight=None):
        """Fit linear model.

        Derived-from - and meant to override - the fit method of the base class.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary

        sample_weight : numpy array of shape [n_samples]
            Individual weights for each sample

            .. versionadded:: 0.17
               parameter *sample_weight* support to LinearRegression.

        Returns
        -------
        self : returns an instance of self.
        """
        def lstsq(a,
                  b,
                  cond=None,
                  overwrite_a=False,
                  overwrite_b=False,
                  check_finite=True,
                  lapack_driver=None):
            """
            Compute least-squares solution to equation Ax = b.
            Compute a vector x such that the 2-norm ``|b - A x|`` is minimized.

            This code was adapted from the Scipy distribution: https://github.com/scipy/scipy/blob/v1.2.1/scipy/linalg/basic.py#L1047-L1264

            Parameters
            ----------
            a : (M, N) array_like
                Left hand side matrix (2-D array).
            b : (M,) or (M, K) array_like
                Right hand side matrix or vector (1-D or 2-D array).
            cond : float, optional
                Cutoff for 'small' singular values; used to determine effective
                rank of a. Singular values smaller than
                ``rcond * largest_singular_value`` are considered zero.
            overwrite_a : bool, optional
                Discard data in `a` (may enhance performance). Default is False.
            overwrite_b : bool, optional
                Discard data in `b` (may enhance performance). Default is False.
            check_finite : bool, optional
                Whether to check that the input matrices contain only finite numbers.
                Disabling may give a performance gain, but may result in problems
                (crashes, non-termination) if the inputs do contain infinities or NaNs.
            lapack_driver : str, optional
                Which LAPACK driver is used to solve the least-squares problem.
                Options are ``'gelsd'``, ``'gelsy'``, ``'gelss'``. Default
                (``'gelsd'``) is a good choice.  However, ``'gelsy'`` can be slightly
                faster on many problems.  ``'gelss'`` was used historically.  It is
                generally slow but uses less memory.
                .. versionadded:: 0.17.0
            Returns
            -------
            x : (N,) or (N, K) ndarray
                Least-squares solution.  Return shape matches shape of `b`.
            residues : (0,) or () or (K,) ndarray
                Sums of residues, squared 2-norm for each column in ``b - a x``.
                If rank of matrix a is ``< N`` or ``N > M``, or ``'gelsy'`` is used,
                this is a length zero array. If b was 1-D, this is a () shape array
                (numpy scalar), otherwise the shape is (K,).
            rank : int
                Effective rank of matrix `a`.
            s : (min(M,N),) ndarray or None
                Singular values of `a`. The condition number of a is
                ``abs(s[0] / s[-1])``. None is returned when ``'gelsy'`` is used.
            Raises
            ------
            LinAlgError
                If computation does not converge.
            ValueError
                When parameters are wrong.
            See Also
            --------
            optimize.nnls : linear least squares with non-negativity constraint
            Examples
            --------
            >>> from scipy.linalg import lstsq
            >>> import matplotlib.pyplot as plt
            Suppose we have the following data:
            >>> x = np.array([1, 2.5, 3.5, 4, 5, 7, 8.5])
            >>> y = np.array([0.3, 1.1, 1.5, 2.0, 3.2, 6.6, 8.6])
            We want to fit a quadratic polynomial of the form ``y = a + b*x**2``
            to this data.  We first form the "design matrix" M, with a constant
            column of 1s and a column containing ``x**2``:
            >>> M = x[:, np.newaxis]**[0, 2]
            >>> M
            array([[  1.  ,   1.  ],
                   [  1.  ,   6.25],
                   [  1.  ,  12.25],
                   [  1.  ,  16.  ],
                   [  1.  ,  25.  ],
                   [  1.  ,  49.  ],
                   [  1.  ,  72.25]])
            We want to find the least-squares solution to ``M.dot(p) = y``,
            where ``p`` is a vector with length 2 that holds the parameters
            ``a`` and ``b``.
            >>> p, res, rnk, s = lstsq(M, y)
            >>> p
            array([ 0.20925829,  0.12013861])
            Plot the data and the fitted curve.
            >>> plt.plot(x, y, 'o', label='data')
            >>> xx = np.linspace(0, 9, 101)
            >>> yy = p[0] + p[1]*xx**2
            >>> plt.plot(xx, yy, label='least squares fit, $y = a + bx^2$')
            >>> plt.xlabel('x')
            >>> plt.ylabel('y')
            >>> plt.legend(framealpha=1, shadow=True)
            >>> plt.grid(alpha=0.25)
            >>> plt.show()
            """

            a1 = _asarray_validated(a, check_finite=check_finite)
            b1 = _asarray_validated(b, check_finite=check_finite)
            if len(a1.shape) != 2:
                raise ValueError('expected matrix')
            m, n = a1.shape

            if len(b1.shape) == 2:
                nrhs = b1.shape[1]
            else:
                nrhs = 1
            if m != b1.shape[0]:
                raise ValueError('incompatible dimensions')
            if m == 0 or n == 0:  # Zero-sized problem, confuses LAPACK
                x = np.zeros((n, ) + b1.shape[1:],
                             dtype=np.common_type(a1, b1))
                if n == 0:
                    residues = np.linalg.norm(b1, axis=0)**2
                else:
                    residues = np.empty((0, ))
                return x, residues, 0, np.empty((0, ))

            driver = lapack_driver
            if driver is None:
                global default_lapack_driver
                driver = default_lapack_driver
            if driver not in ('gelsd', 'gelsy', 'gelss'):
                raise ValueError('LAPACK driver "%s" is not found' % driver)

            lapack_func, lapack_lwork = get_lapack_funcs(
                (driver, '%s_lwork' % driver), (a1, b1))
            real_data = lapack_func.dtype.kind == 'f'

            if m < n:
                # need to extend b matrix as it will be filled with
                # a larger solution matrix
                if len(b1.shape) == 2:
                    b2 = np.zeros((n, nrhs), dtype=lapack_func.dtype)
                    b2[:m, :] = b1
                else:
                    b2 = np.zeros(n, dtype=lapack_func.dtype)
                    b2[:m] = b1
                b1 = b2

            overwrite_a = overwrite_a or _datacopied(a1, a)
            overwrite_b = overwrite_b or _datacopied(b1, b)

            if cond is None:
                cond = np.finfo(lapack_func.dtype).eps

            a1_wrk = np.copy(a1)
            b1_wrk = np.copy(b1)
            lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs, cond)
            x_check, s_check, rank_check, info = lapack_func(
                a1_wrk, b1_wrk, lwork, iwork, cond, False, False)

            driver = 'gelss'
            if driver in ('gelss', 'gelsd'):
                if driver == 'gelss':
                    if not context:
                        a1_wrk = np.copy(a1)
                        b1_wrk = np.copy(b1)
                        lwork, iwork = _compute_lwork(lapack_lwork, m, n, nrhs,
                                                      cond)
                        x, s, rank, info = lapack_func(a1_wrk, b1_wrk, lwork,
                                                       iwork, cond, False,
                                                       False)
                    else:
                        try:
                            # Check that we aren't dealing with an underconstrained problem ...
                            if m < n:
                                pkg.log.error(
                                    Exception(
                                        "Underconstrained problems not yet supported by Magma."
                                    ))

                            # Initialize
                            a1_trans = np.copy(a1, order='F')
                            a1_gpu = gpuarray.to_gpu(a1_trans)

                            # Note that the result for 'x' gets written to the vector inputted for b
                            x_trans = np.copy(b1, order='F')
                            x_gpu = gpuarray.to_gpu(x_trans)

                            # Init singular-value decomposition (SVD) output & buffer arrays
                            s = np.zeros(min(m, n), np.float32)
                            u = np.zeros((m, m), np.float32)
                            vh = np.zeros((n, n), np.float32)

                            # Query and allocate optimal workspace
                            # n.b.: - the result for 'x' gets written to the input vector for b, so we just label b->x
                            #       - assume magma variables lda=ldb=m throughout here
                            lwork_SVD = magma.magma_sgesvd_buffersize(
                                'A', 'A', m, n, a1_trans.ctypes.data, m,
                                s.ctypes.data, u.ctypes.data, m,
                                vh.ctypes.data, n)

                            # For some reason, magma_sgels_buffersize() does not return the right value for large problems, so
                            # we compute the values used for the validation check (see Magma SGELS documentation) directly and use that
                            #lwork_LS = magma.magma_sgels_buffersize('n', m, n, nrhs, a1_trans.ctypes.data, m, x_trans.ctypes.data, m)
                            nb = magma.magma_get_sgeqrf_nb(m, n)
                            check = (m - n + nb) * (nrhs + nb) + nrhs * nb
                            lwork_LS = check

                            # Allocate workspaces
                            hwork_SVD = np.zeros(lwork_SVD,
                                                 np.float32,
                                                 order='F')
                            hwork_LS = np.zeros(lwork_LS, np.float32)

                            # Compute SVD
                            timer.start("SVD")
                            magma.magma_sgesvd('A', 'A', m, n,
                                               a1_trans.ctypes.data, m,
                                               s.ctypes.data, u.ctypes.data, m,
                                               vh.ctypes.data, n,
                                               hwork_SVD.ctypes.data,
                                               lwork_SVD)
                            timer.stop("SVD")

                            # Note, the use of s_i>rcond here; this is meant to select
                            # values that are effectively non-zero.  Results will depend
                            # somewhat on the choice for this value.  This criterion was
                            # adopted from that utilized by scipy.linalg.basic.lstsq()
                            rcond = np.finfo(lapack_func.dtype).eps * s[0]
                            rank = sum(1 for s_i in s if s_i > rcond)

                            # Run LS solver
                            timer.start("LS")
                            magma.magma_sgels_gpu('n', m, n, nrhs,
                                                  a1_gpu.gpudata, m,
                                                  x_gpu.gpudata, m,
                                                  hwork_LS.ctypes.data,
                                                  lwork_LS)
                            timer.stop("LS")

                            # Unload result from GPU
                            x = x_gpu.get()

                        except magma.MagmaError as e:
                            info = e._status
                        else:
                            info = 0

                elif driver == 'gelsd':
                    if real_data:
                        if not context:
                            raise Exception(
                                "For some reason, the CUDA implementation of fit() is being called when context is False."
                            )
                        else:
                            raise Exception(
                                "gelsd not supported using Cuda yet")
                    else:  # complex data
                        raise LinAlgError(
                            "driver=%s not yet supported for complex data" %
                            (driver))
                if info > 0:
                    raise LinAlgError(
                        "SVD did not converge in Linear Least Squares")
                if info < 0:
                    raise ValueError(
                        'illegal value in %d-th argument of internal %s' %
                        (-info, lapack_driver))
                resids = np.asarray([], dtype=x.dtype)
                if m > n:
                    x1 = x[:n]
                    if rank == n:
                        resids = np.sum(np.abs(x[n:])**2, axis=0)
                    x = x1

            elif driver == 'gelsy':
                raise LinAlgError("driver=%s not yet supported" % (driver))

            #pkg.log.close("Done", time_elapsed=True)
            return x, resids, rank, s

        n_jobs_ = self.n_jobs
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            raise Exception(
                "Sparse matrices not supported yet for Cuda implementation.")
        else:
            ###############################
            self.coef_, self._residues, self.rank_, self.singular_ = lstsq(
                X, y)
            ###############################
            self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
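
For reference, a minimal NumPy sketch of the SVD-based least-squares solve that the Magma branch above performs (effective rank taken from singular values above eps * s[0], as in scipy.linalg.lstsq). The function and variable names below are illustrative only, not part of the class above.

import numpy as np

def lstsq_svd_sketch(A, b):
    # Solve min ||A x - b||_2 via the SVD, mirroring the rank/rcond logic above.
    U, s, Vh = np.linalg.svd(A, full_matrices=False)
    rcond = np.finfo(A.dtype).eps * s[0]       # same effective-rank criterion as above
    rank = int(np.sum(s > rcond))
    s_inv = np.zeros_like(s)
    s_inv[:rank] = 1.0 / s[:rank]              # keep only non-negligible singular values
    x = Vh.T @ (s_inv * (U.T @ b))             # pseudo-inverse applied to b
    return x, rank

A = np.random.rand(20, 3)
b = np.random.rand(20)
x, rank = lstsq_svd_sketch(A, b)               # x matches np.linalg.lstsq(A, b)[0] up to fp error
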
Example #53
    def fit(self, X, y):
        """Fit classifier.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,)
            The ground truth of the input samples (labels).

        Returns
        -------
        self : object
        """

        # Validate inputs X and y
        X, y = check_X_y(X, y)
        X = check_array(X)
        self._set_n_classes(y)

        n_samples = X.shape[0]

        # initialize matrix for storing newly generated features
        new_features = np.zeros([n_samples, self.n_base_estimators_])

        # build CV datasets
        X_new, y_new, index_lists = split_datasets(
            X,
            y,
            n_folds=self.n_folds,
            shuffle_data=self.shuffle_data,
            random_state=self.random_state)

        # iterate over all base classifiers
        for i, clf in enumerate(self.base_estimators):
            # iterate over all folds
            for j in range(self.n_folds):
                # build train and test index
                full_idx = list(range(n_samples))
                test_idx = index_lists[j]
                train_idx = list_diff(full_idx, test_idx)
                X_train, y_train = X_new[train_idx, :], y_new[train_idx]
                X_test, y_test = X_new[test_idx, :], y_new[test_idx]

                # train the classifier
                clf.fit(X_train, y_train)

                # generate the new features on the pseudo test set
                if self.use_proba:
                    new_features[test_idx, i] = clf.predict_proba(X_test)[:, 1]
                else:
                    new_features[test_idx, i] = clf.predict(X_test)

        # build the new dataset for training
        if self.keep_original:
            X_new_comb = np.concatenate([X_new, new_features], axis=1)
        else:
            X_new_comb = new_features
        y_new_comb = y_new

        # train the meta classifier
        self.meta_clf.fit(X_new_comb, y_new_comb)
        self.fitted_ = True

        # train all base classifiers on the full train dataset
        # iterate over all base classifiers
        for i, clf in enumerate(self.base_estimators):
            clf.fit(X_new, y_new)

        return self
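
For intuition, a self-contained sketch of the same out-of-fold stacking idea using plain scikit-learn pieces; the estimators, fold count, and dataset below are arbitrary illustration choices, not attributes of the class above.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
base_estimators = [LogisticRegression(max_iter=1000),
                   DecisionTreeClassifier(random_state=0)]
meta_clf = LogisticRegression(max_iter=1000)

# Out-of-fold predictions of each base estimator become the meta-features.
new_features = np.zeros((X.shape[0], len(base_estimators)))
kf = KFold(n_splits=3, shuffle=True, random_state=0)
for i, clf in enumerate(base_estimators):
    for train_idx, test_idx in kf.split(X):
        clf.fit(X[train_idx], y[train_idx])
        new_features[test_idx, i] = clf.predict_proba(X[test_idx])[:, 1]

meta_clf.fit(new_features, y)        # train the meta classifier on the new features
for clf in base_estimators:          # then refit the base estimators on all data
    clf.fit(X, y)
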
Example #54
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array-like, shape (n_samples,)
            Target vector relative to X.
        sample_weight : array-like, shape (n_samples,)
            Weight given to each sample.
        Returns
        -------
        self : object
        """
        X, y = check_X_y(X,
                         y,
                         copy=False,
                         accept_sparse=['csr'],
                         y_numeric=True,
                         dtype=[np.float64, np.float32])
        if sample_weight is not None:
            sample_weight = np.array(sample_weight)
            check_consistent_length(y, sample_weight)
        else:
            sample_weight = np.ones_like(y)

        if self.epsilon < 1.0:
            raise ValueError(
                "epsilon should be greater than or equal to 1.0, got %f" %
                self.epsilon)

        if self.warm_start and hasattr(self, 'coef_'):
            parameters = np.concatenate((self.coef_, [self.intercept_]))
        else:
            if self.fit_intercept:
                parameters = np.zeros(X.shape[1] + 1)
            else:
                parameters = np.zeros(X.shape[1])
            # Make sure to initialize the scale parameter to a strictly
            # positive value:
            parameters[-1] = 1

        # Sigma or the scale factor should be non-negative.
        # Setting it to be zero might cause undefined bounds hence we set it
        # to a value close to zero.
        bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))
        bounds[-1][0] = np.finfo(np.float64).eps * 10

        parameters, f, dict_ = optimize.fmin_l_bfgs_b(
            _huber_loss_and_gradient,
            parameters,
            args=(X, y, self.epsilon, self.alpha, self.sigma, sample_weight),
            maxiter=self.max_iter,
            pgtol=self.tol,
            bounds=bounds,
            iprint=0)
        if dict_['warnflag'] == 2:
            raise ValueError("HuberRegressor convergence failed:"
                             " l-BFGS-b solver terminated with %s" %
                             dict_['task'].decode('ascii'))
        # In scipy <= 1.0.0, nit may exceed maxiter.
        # See https://github.com/scipy/scipy/issues/7854.
        self.n_iter_ = min(dict_['nit'], self.max_iter)
        if self.fit_intercept:
            self.intercept_ = parameters[-1]
        else:
            self.intercept_ = 0.0
        self.coef_ = parameters[:X.shape[1]]

        residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)
        return self
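
The objective minimized above via `_huber_loss_and_gradient` is built on the Huber loss, which is quadratic for small residuals and linear for large ones. Below is a simplified sketch of that loss for intuition only; it ignores the scale/sigma handling and the L2 penalty used by the optimizer above, and the epsilon default is just an illustrative choice.

import numpy as np

def huber_loss(residual, epsilon=1.35):
    # Quadratic for |r| <= epsilon, linear beyond it (simplified sketch).
    r = np.abs(residual)
    quadratic = 0.5 * r ** 2
    linear = epsilon * r - 0.5 * epsilon ** 2
    return np.where(r <= epsilon, quadratic, linear)

# Large residuals contribute only linearly, which limits outlier influence.
print(huber_loss(np.array([0.5, 2.0, 10.0])))
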
Example #55
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y)

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        self.epsilon_ = self.var_smoothing

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            if self.priors is not None:
                priors = np.asarray(self.priors)

                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError("The sum of the priors should be 1.")
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features %d does not match previous data %d." %
                    (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError(
                "The target label(s) %s in y do not exist in the initial classes %s"
                % (unique_y[~unique_y_in_classes], classes))

        noisy_class_counts = self._noisy_class_counts(y)

        for _i, y_i in enumerate(unique_y):
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            n_i = noisy_class_counts[_i]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i],
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                n_noisy=n_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += n_i

        self.sigma_[:, :] += self.epsilon_

        # Update only if no priors were provided
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        self.accountant.spend(self.epsilon, 0)

        return self
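
A small illustrative sketch of the bounds handling above: if bounds are not supplied they are inferred from the data (which is why a PrivacyLeakWarning is raised), and each feature is clipped into its bounds before the noisy statistics are computed. The helper below is a stand-in for illustration, not diffprivlib's own check_bounds/clip_to_bounds.

import numpy as np

def clip_to_bounds_sketch(X, bounds=None):
    # Clip each feature column into [lower, upper]; infer bounds if absent.
    if bounds is None:
        # Data-derived bounds are convenient but leak information,
        # which is exactly what PrivacyLeakWarning flags above.
        bounds = (X.min(axis=0), X.max(axis=0))
    lower, upper = bounds
    return np.clip(X, lower, upper), bounds

X = np.array([[0.0, 5.0], [2.0, 7.0], [10.0, 6.0]])
X_clipped, bounds = clip_to_bounds_sketch(
    X, bounds=(np.array([0.0, 5.0]), np.array([5.0, 7.0])))
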
Example #56
    def fit(self, X, y):
        y, _ = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=True)
        return self
Example #57
    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking
        X, y = check_X_y(X, y, dtype=[np.float32, np.float64])
        y = self._encode_y(y)
        if X.shape[0] == 1 or X.shape[1] == 1:
            raise ValueError(
                'Passing only one sample or one feature is not supported yet. '
                'See numba issue #3569.'
            )
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        if self.verbose:
            print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="",
                  flush=True)
        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned = self.bin_mapper_.fit_transform(X)
        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        if self.scoring is not None and self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned, y, test_size=self.validation_split,
                stratify=stratify, random_state=rng)
            if X_binned_train.size == 0 or X_binned_val.size == 0:
                raise ValueError(
                    f'Not enough data (n_samples={X_binned.shape[0]}) to '
                    f'perform early stopping with validation_split='
                    f'{self.validation_split}. Use more training data or '
                    f'adjust validation_split.'
                )
            # Histogram computation is faster on feature-aligned data.
            X_binned_train = np.asfortranarray(X_binned_train)
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.ascontiguousarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(
                np.arange(X_binned_train.shape[0]), subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        n_samples = X_binned_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(
            shape=(n_samples, self.n_trees_per_iteration_),
            dtype=y_train.dtype
        )
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_
        )
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []

        scorer = check_scoring(self, self.scoring)
        self.train_scores_ = []
        if self.scoring is not None:
            # Add predictions of the initial model (before the first tree)
            predicted_train = self._predict_binned(X_binned_train)
            score_train = scorer._sign * scorer._score_func(y_train,
                                                            predicted_train)
            self.train_scores_.append(score_train)

            if self.validation_split is not None:
                self.validation_scores_ = []
                predicted_val = self._predict_binned(X_binned_val)
                score_val = scorer._sign * scorer._score_func(y_val,
                                                              predicted_val)
                self.validation_scores_.append(score_val)

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print(f"[{iteration + 1}/{self.max_iter}] ", end='',
                      flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(zip(
                    np.array_split(gradients, self.n_trees_per_iteration_),
                    np.array_split(hessians, self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                grower = TreeGrower(
                    X_binned_train, gradients_at_k, hessians_at_k,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()

                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                _update_raw_predictions(leaves_data, raw_predictions[:, k])
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_stop = self._check_early_stopping(
                scorer, X_binned_small_train, y_small_train,
                X_binned_val, y_val)

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            if should_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")

        self.train_scores_ = np.asarray(self.train_scores_)
        if self.scoring is not None and self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
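
A stripped-down sketch of what `_update_raw_predictions` consumes above: `leaves_data` pairs each finalized leaf's value with the indices of the samples that fell into it, and the raw predictions of those samples are incremented accordingly (the learning rate is passed to the grower as `shrinkage` above, so it is presumably already reflected in the leaf values). Illustration only, not the jitted implementation.

import numpy as np

def update_raw_predictions_sketch(leaves_data, raw_predictions):
    # leaves_data: list of (leaf_value, sample_indices) pairs, as built above.
    for leaf_value, sample_indices in leaves_data:
        raw_predictions[sample_indices] += leaf_value
    return raw_predictions

raw = np.zeros(6)
leaves = [(0.3, np.array([0, 2, 4])), (-0.1, np.array([1, 3, 5]))]
update_raw_predictions_sketch(leaves, raw)   # raw is updated in place
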
Example #58
    def fit(self,
            X,
            y,
            sample_weight=None,
            relative_penalties=None,
            groups=None):
        """Fit the model to training data. If n_splits > 1 also run n-fold cross
        validation on all values in lambda_path.

        The model will be fit n+1 times. On the first pass, the lambda_path
        will be determined; on the remaining passes, the model performance is
        evaluated for each value of lambda. After cross validation, the
        attribute `cv_mean_score_` will contain the mean score over all folds
        for each value of lambda, and `cv_standard_error_` will contain the
        standard error of `cv_mean_score_` for each value of lambda. The value
        of lambda which achieves the best performance in cross validation will
        be saved to `lambda_max_`. Additionally, the largest value of lambda
        s.t.:
            cv_score(l) >= cv_score(lambda_max_) -\
                           cut_point * standard_error(lambda_max_)
        will be saved to `lambda_best_`.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
            Input features

        y : array, shape (n_samples,)
            Target values

        sample_weight : array, shape (n_samples,)
            Optional weight vector for observations

        relative_penalties: array, shape (n_features,)
            Optional relative weight vector for penalty.
            0 entries remove penalty.

        groups: array, shape (n_samples,)
            Group labels for the samples used while splitting the dataset into train/test set.
            If the groups are specified, the groups will be passed to sklearn.model_selection.GroupKFold.
            If None, then data will be split randomly for K-fold cross-validation via sklearn.model_selection.KFold.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y, accept_sparse='csr', ensure_min_samples=2)
        if sample_weight is None:
            sample_weight = np.ones(X.shape[0])
        else:
            sample_weight = np.asarray(sample_weight)

        if not np.isscalar(self.lower_limits):
            self.lower_limits = np.asarray(self.lower_limits)
            if len(self.lower_limits) != X.shape[1]:
                raise ValueError("lower_limits must equal number of features")

        if not np.isscalar(self.upper_limits):
            self.upper_limits = np.asarray(self.upper_limits)
            if len(self.upper_limits) != X.shape[1]:
                raise ValueError("upper_limits must equal number of features")

        if any(self.lower_limits > 0) if isinstance(
                self.lower_limits, np.ndarray) else self.lower_limits > 0:
            raise ValueError("lower_limits must be non-positive")

        if any(self.upper_limits < 0) if isinstance(
                self.upper_limits, np.ndarray) else self.upper_limits < 0:
            raise ValueError("upper_limits must be positive")

        if self.alpha > 1 or self.alpha < 0:
            raise ValueError("alpha must be between 0 and 1")

        if self.n_splits > 0 and self.n_splits < 3:
            raise ValueError("n_splits must be at least 3")

        self._fit(X, y, sample_weight, relative_penalties)

        if self.n_splits >= 3:
            if groups is None:
                self._cv = KFold(n_splits=self.n_splits,
                                 shuffle=True,
                                 random_state=self.random_state)
            else:
                self._cv = GroupKFold(n_splits=self.n_splits)

            cv_scores = _score_lambda_path(self,
                                           X,
                                           y,
                                           groups,
                                           sample_weight,
                                           relative_penalties,
                                           self.scoring,
                                           n_jobs=self.n_jobs,
                                           verbose=self.verbose)

            self.cv_mean_score_ = np.atleast_1d(np.mean(cv_scores, axis=0))
            self.cv_standard_error_ = np.atleast_1d(stats.sem(cv_scores))

            self.lambda_max_inx_ = np.argmax(self.cv_mean_score_)
            self.lambda_max_ = self.lambda_path_[self.lambda_max_inx_]

            target_score = self.cv_mean_score_[self.lambda_max_inx_] -\
                self.cut_point * self.cv_standard_error_[self.lambda_max_inx_]

            self.lambda_best_inx_ = np.argwhere(
                self.cv_mean_score_ >= target_score)[0]
            self.lambda_best_ = self.lambda_path_[self.lambda_best_inx_]

            self.coef_ = self.coef_path_[..., self.lambda_best_inx_]
            self.coef_ = self.coef_.squeeze(axis=self.coef_.ndim - 1)
            self.intercept_ = self.intercept_path_[
                ..., self.lambda_best_inx_].squeeze()
            if self.intercept_.shape == ():  # convert 0d array to scalar
                self.intercept_ = float(self.intercept_)

        return self
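
The `lambda_best_` selection above follows a "one standard error"-style rule: among all lambdas whose mean CV score is within `cut_point` standard errors of the best score, take the first along the path, i.e. the largest lambda, since the path runs from most to least regularized. A tiny NumPy sketch with made-up scores:

import numpy as np

cv_mean_score = np.array([0.70, 0.76, 0.78, 0.77, 0.75])      # one entry per lambda
cv_standard_error = np.array([0.02, 0.02, 0.02, 0.03, 0.03])
cut_point = 1.0

best_inx = np.argmax(cv_mean_score)                            # index 2 here
target = cv_mean_score[best_inx] - cut_point * cv_standard_error[best_inx]
lambda_best_inx = np.argwhere(cv_mean_score >= target)[0]      # index 1: a larger lambda
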
Example #59
    def fit(self, X, y):
        y, _ = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=False)
        self.sampling_strategy_ = "sampling_strategy_"
        return self
Example #60
    def sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(ADASYN, self).sample(X, y)

        # Keep the samples from the majority class
        X_resampled = X.copy()
        y_resampled = y.copy()

        # Define the number of samples to create
        # We handle only the two-class problem for the moment.
        if self.ratio == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) -
                              self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # Print if verbose is true
        if self.verbose:
            print('Finding the {} nearest neighbours...'.format(self.k))

        # Look for k-th nearest neighbours, excluding, of course, the
        # point itself.
        self.nearest_neighbour.fit(X)

        # Get the distance to the NN
        _, ind_nn = self.nearest_neighbour.kneighbors(X_min)

        # Compute the ratio of majority samples next to minority samples
        ratio_nn = np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / self.k
        # Normalize the ratio
        ratio_nn /= np.sum(ratio_nn)

        # Compute the number of samples to be generated
        num_samples_nn = np.round(ratio_nn * num_samples).astype(int)

        # For each minority sample
        for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn):
            # Fix the seed
            np.random.seed(self.random_state)
            # Pick the neighbours to use
            nn_zs = np.random.randint(1, high=self.k + 1, size=num_sample_i)

            # Create new samples by interpolating between x_i and the selected
            # neighbour: x_gen = x_i + step * (x_nn - x_i), as in ADASYN/SMOTE
            for nn_z in nn_zs:
                step = np.random.uniform()
                x_gen = x_i + step * (X[x_i_nn[nn_z], :] - x_i)
                X_resampled = np.vstack((X_resampled, x_gen))
                y_resampled = np.hstack((y_resampled, self.min_c_))

        if self.verbose:
            print("Over-sampling performed: {}".format(Counter(y_resampled)))

        return X_resampled, y_resampled
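
The generation step above is plain SMOTE-style interpolation: each synthetic point lies on the segment between a minority sample and one of its nearest neighbours, and the number of points generated per sample is proportional to the fraction of majority neighbours around it. A minimal standalone sketch of the interpolation step (variable names are illustrative):

import numpy as np

rng = np.random.RandomState(0)
x_i = np.array([1.0, 1.0])           # a minority sample
x_nn = np.array([3.0, 2.0])          # one of its nearest neighbours

step = rng.uniform()                 # lambda drawn from [0, 1)
x_gen = x_i + step * (x_nn - x_i)    # synthetic sample between x_i and x_nn
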