def fit(self, x, y):
    """
    Iteratively adjust the prototype vectors so that new incoming
    points can be classified using the existing data points.

    Parameters
    ----------
    x: input
    y: label
    """
    x, y = check_X_y(x, y)
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    self.X_ = x
    self.y_ = y
    while self.epsilon >= 0.01:
        # pick a random training sample
        rnd_i = np.random.randint(0, len(x))
        rnd_s = x[rnd_i]
        target_y = y[rnd_i]
        # decay the learning rate
        self.epsilon = self.epsilon - self.epsilon_dec_factor
        closest_pvector = self.find_closest(rnd_s)[1]
        # move the closest prototype towards the sample if the class
        # matches, away from it otherwise
        if target_y == closest_pvector.class_id:
            closest_pvector.update(rnd_s)
        else:
            closest_pvector.update(rnd_s, False)
        closest_pvector.epsilon = self.epsilon
    return self.p_vectors
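Every example in this listing opens with the same guard; a minimal standalone sketch (my addition, assuming a standard scikit-learn install) of what check_classification_targets accepts and rejects:

import numpy as np
from sklearn.utils.multiclass import check_classification_targets

check_classification_targets(np.array([0, 1, 1, 0]))     # binary: ok
check_classification_targets(np.array(['a', 'b', 'c']))  # multiclass: ok
try:
    check_classification_targets(np.array([0.1, 0.7, 0.5]))  # continuous
except ValueError as exc:
    print(exc)  # "Unknown label type: ..."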
def fit(self, X, y):
    """Fit per-leaf class posteriors; dispatch to the multilabel path
    when y is a multilabel indicator matrix."""
    check_classification_targets(y)
    if type_of_target(y) == 'multilabel-indicator':
        # Fit multilabel binary task.
        self.multilabel = True
        return self.fit_multilabel(X, y)

    num_classes = len(np.unique(y))
    self.uniform_posterior = np.ones(num_classes) / num_classes

    self.leaf_to_posterior = {}

    for leaf_id in np.unique(X):
        idxs_in_leaf = np.where(X == leaf_id)[0]
        class_counts = [
            len(np.where(y[idxs_in_leaf] == y_val)[0])
            for y_val in np.unique(y)
        ]
        posteriors = np.nan_to_num(
            np.array(class_counts) / np.sum(class_counts))

        if self.finite_sample_correction:
            posteriors = self._finite_sample_correction(
                posteriors, len(idxs_in_leaf), len(np.unique(y)))

        self.leaf_to_posterior[leaf_id] = posteriors

    self._is_fitted = True

    return self
def fit(self, X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray]) -> "DAGClassifier":
    """
    Fits the structure model (sm) using the concatenation of X and y.
    """
    # classification target check
    check_classification_targets(y)

    # encode the categories to be numeric
    enc = LabelEncoder()
    y = y.copy()
    y[:] = enc.fit_transform(y)

    # store the classes from the LabelEncoder
    self.classes_ = enc.classes_

    # class number checks
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError("This solver needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: {}".format(self.classes_[0]))
    if n_classes > 2:
        raise ValueError(
            "This solver does not support more than 2 classes")

    # store the private attr __target_dist_type
    self.__target_dist_type = "bin"

    # fit the NOTEARS model
    super().fit(X, y)
    return self
def fit(self, X, y):
    """Check inputs and statistics of the sampler.

    You should use ``fit_resample`` in all cases.

    Parameters
    ----------
    X : {array-like, dataframe, sparse matrix} of shape \
            (n_samples, n_features)
        Data array.

    y : array-like of shape (n_samples,)
        Target array.

    Returns
    -------
    self : object
        Return the instance itself.
    """
    # we need to overwrite SamplerMixin.fit to bypass the validation
    if self.validate:
        check_classification_targets(y)
        X, y, _ = self._check_X_y(X, y, accept_sparse=self.accept_sparse)

    self.sampling_strategy_ = check_sampling_strategy(
        self.sampling_strategy, y, self._sampling_type)
    return self
def _validate_y(self, y):
    # flatten y, verify it holds classification targets, then encode the
    # labels as indices into self.classes_
    y = column_or_1d(y, warn=True)
    check_classification_targets(y)
    self.classes_, y = np.unique(y, return_inverse=True)
    self.n_classes_ = len(self.classes_)
    return y
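The np.unique(..., return_inverse=True) idiom used above records the class set and simultaneously re-encodes y as indices into it; a small illustration (my sketch):

import numpy as np

y = np.array(['cat', 'dog', 'cat', 'bird'])
classes_, y_enc = np.unique(y, return_inverse=True)
print(classes_)         # ['bird' 'cat' 'dog']
print(y_enc)            # [1 2 1 0]
print(classes_[y_enc])  # round-trips back to the original labels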
def fit_resample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, dataframe, sparse matrix} of shape \
            (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like of shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {array-like, dataframe, sparse matrix} of shape \
            (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : array-like of shape (n_samples_new,)
        The corresponding label of `X_resampled`.
    """
    check_classification_targets(y)
    arrays_transformer = ArraysTransformer(X, y)
    X, y, binarize_y = self._check_X_y(X, y)

    self.sampling_strategy_ = check_sampling_strategy(
        self.sampling_strategy, y, self._sampling_type)

    output = self._fit_resample(X, y)

    y_ = (label_binarize(output[1], np.unique(y))
          if binarize_y else output[1])

    X_, y_ = arrays_transformer.transform(output[0], y_)
    return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
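When binarize_y is set, the resampled labels are converted back to indicator form with label_binarize; a small sketch of that conversion (my addition, using the keyword signature of recent scikit-learn):

import numpy as np
from sklearn.preprocessing import label_binarize

y_resampled = np.array([0, 1, 2, 1])
print(label_binarize(y_resampled, classes=np.unique(y_resampled)))
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 1 0]]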
def fit(self, X, y):
    check_classification_targets(y)
    le = LabelEncoder()
    y_ind = le.fit_transform(y)
    self.classes_ = classes = le.classes_
    n_classes = classes.size

    # concatenate all documents of each class into one "class document"
    X_clas = []
    for cur_class in range(n_classes):
        center_mask = y_ind == cur_class
        sentence = ' '.join(list(chain(*X[center_mask])))
        X_clas.append(sentence)

    tfidf = TfidfVectorizer(norm=self.norm, use_idf=self.use_idf,
                            smooth_idf=self.smooth_idf,
                            sublinear_tf=self.sublinear_tf)
    self.tfidf_array_ = tfidf.fit_transform(X_clas)
    self.tfidf_ = tfidf
    self.fitted = True
    return self
def fit(self, X, y):
    '''
    Fits variational Bayesian Logistic Regression

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Matrix of explanatory variables

    y: array-like of size [n_samples]
       Vector of dependent variables

    Returns
    -------
    self: object
       self
    '''
    # preprocess data
    X, y = check_X_y(X, y, dtype=np.float64)
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    # take into account bias term if required
    n_samples, n_features = X.shape
    n_features = n_features + int(self.fit_intercept)
    if self.fit_intercept:
        X = np.hstack((np.ones([n_samples, 1]), X))

    # handle multiclass problems using One-vs-Rest
    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes")
    if n_classes > 2:
        self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
        self.intercept_ = [0] * n_classes
    else:
        self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]

    # hyperparameters of the prior on the weight precision
    a = self.a + 0.5 * n_features
    b = self.b

    for i in range(len(self.coef_)):
        if n_classes == 2:
            pos_class = self.classes_[1]
        else:
            pos_class = self.classes_[i]
        mask = (y == pos_class)
        y_bin = np.ones(y.shape, dtype=np.float64)
        y_bin[~mask] = 0
        coef_, sigma_ = self._fit(X, y_bin, a, b)
        intercept_ = 0
        if self.fit_intercept:
            intercept_ = coef_[0]
            coef_ = coef_[1:]
        self.coef_[i] = coef_
        self.intercept_[i] = intercept_
        self.sigma_[i] = sigma_

    self.coef_ = np.asarray(self.coef_)
    return self
def fit(self, X, y=None):
    """Compute the bin edges for each feature.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data to transform.

    y : None or array-like, shape = (n_samples,)
        Class labels for each sample. Only used if ``strategy='entropy'``.
    """
    if self.strategy == 'entropy':
        if y is None:
            raise ValueError("y cannot be None if strategy='entropy'.")
        X, y = check_X_y(X, y, dtype='float64')
        check_classification_targets(y)
    else:
        X = check_array(X, dtype='float64')
    n_samples, n_timestamps = X.shape
    self._n_timestamps_fit = n_timestamps
    self._alphabet = self._check_params(n_samples)
    self._check_constant(X)
    self.bin_edges_ = self._compute_bins(
        X, y, n_timestamps, self.n_bins, self.strategy)
    return self
def _encode_y(self, y):
    """
    Encode classes into {0, ..., n_classes - 1} and set the attributes
    classes_, n_classes_ and n_trees_per_iteration_.

    Parameters
    ----------
    y : ndarray
        Array of input labels

    Returns
    -------
    output : ndarray
        Encoded array of labels
    """
    check_classification_targets(y)

    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    n_classes_ = self.classes_.shape[0]
    self._n_classes_ = n_classes_
    # only 1 tree for binary classification.
    # TODO: For multiclass classification, we build 1 tree per class.
    self.n_trees_per_iteration_ = 1 if n_classes_ <= 2 else n_classes_
    encoded_y = np.ascontiguousarray(encoded_y, dtype=np.float32)
    return encoded_y
def fit(self, X, y):
    """
    Fits transformed data X given corresponding class labels y.

    Parameters
    ----------
    X : array of shape [n_samples, n_features]
        the transformed input data

    y : array of shape [n_samples]
        the class labels
    """
    check_classification_targets(y)
    num_classes = len(np.unique(y))
    self.uniform_posterior = np.ones(num_classes) / num_classes

    self.leaf_to_posterior = {}

    for leaf_id in np.unique(X):
        idxs_in_leaf = np.where(X == leaf_id)[0]
        class_counts = [
            len(np.where(y[idxs_in_leaf] == y_val)[0])
            for y_val in np.unique(y)
        ]
        posteriors = np.nan_to_num(
            np.array(class_counts) / np.sum(class_counts))

        if self.finite_sample_correction:
            posteriors = self._finite_sample_correction(
                posteriors, len(idxs_in_leaf), len(np.unique(y)))

        self.leaf_to_posterior[leaf_id] = posteriors

    self._is_fitted = True

    return self
def fit(self, X, y):
    # Check the algorithm parameters
    self._check_params()

    # Check that X and y have correct shape
    X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double])
    check_classification_targets(y)

    # Encode labels
    le = preprocessing.LabelEncoder()
    le.fit(y)
    self.classes_ = le.classes_
    y_ = le.transform(y)

    # Convert to 2d array
    y_ = y_.reshape((-1, 1))

    self.n_outputs_ = y_.shape[1]
    self.n_classes_ = len(self.classes_)
    self.n_features_ = X.shape[1]

    # Classifier can't train when only one class is present.
    # Trivial case
    if self.n_classes_ == 1:
        return self

    # Get random seed
    rs_ = check_random_state(self.random_state)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    # Define type of data
    fptype = getFPType(X)

    # Fit the model
    train_algo = d4p.gbt_classification_training(
        fptype=fptype,
        nClasses=self.n_classes_,
        splitMethod=self.split_method,
        maxIterations=self.max_iterations,
        maxTreeDepth=self.max_tree_depth,
        shrinkage=self.shrinkage,
        minSplitLoss=self.min_split_loss,
        lambda_=self.reg_lambda,
        observationsPerTreeFraction=self.observations_per_tree_fraction,
        featuresPerNode=self.features_per_node,
        minObservationsInLeafNode=self.min_observations_in_leaf_node,
        memorySavingMode=self.memory_saving_mode,
        maxBins=self.max_bins,
        minBinSize=self.min_bin_size,
        engine=d4p.engines_mcg59(seed=seed_))
    train_result = train_algo.compute(X, y_)

    # Store the model
    self.daal_model_ = train_result.model

    # Return the classifier
    return self
def fit(self, X, y):
    """A fitting function for a classifier.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        The training input samples.
    y : array-like, shape = [n_samples]
        The target values. An array of int.

    Returns
    -------
    self : object
        Returns self.
    """
    # Check that X and y have correct shape
    X, y = check_X_y(X, y)
    check_classification_targets(y)

    # Store training set and number of features
    self.X_ = X
    self.y_ = y
    self.n_features_ = X.shape[1]

    # Establish subspace
    if self.given_subspace is None:
        self.subspace_ = self._assumed_subspace()
    else:
        self.subspace_ = np.array(self.given_subspace)

    # Acquire subspaced X
    subspaced_X = X[:, self.subspace_].astype('float64')

    # Store the classes seen during fit
    self.classes_, y = np.unique(y, return_inverse=True)

    # Scaler
    self.scaler_ = MinMaxScaler()
    self.scaler_.fit(subspaced_X)

    # Expose
    self.model_ = self.expose(subspaced_X, y)

    # HSV
    self._hue = np.argmax(self.model_, axis=2) / float(len(self.classes_))
    self._saturation = np.max(self.model_, axis=2) - \
        np.min(self.model_, axis=2)
    self._value = np.max(self.model_, axis=2)
    self._hsv = np.dstack((self._hue, self._saturation, self._value))

    # Calculate measures
    self._calculate_measures()

    # Prepare linear model
    self.linear_model_ = self.model_.reshape((-1, len(self.classes_)))
    self.linear_model_ = np.divide(self.linear_model_,
                                   np.sum(self.linear_model_, axis=0))

    # Return the classifier
    return self
def fit(self, X: np.array, y: np.array,
        sample_weight: np.array = None) -> Odte:
    # Check parameters are Ok.
    if self.n_estimators < 3:
        raise ValueError(
            f"n_estimators must be greater than 2 but got "
            f"(n_estimators={self.n_estimators})")
    check_classification_targets(y)
    X, y = self._validate_data(X, y)
    # if sample_weight is None, _check_sample_weight returns np.ones
    sample_weight = _check_sample_weight(sample_weight, X,
                                         dtype=np.float64)
    # Initialize computed parameters
    # Build the estimator
    self.max_features_ = self._initialize_max_features()
    # build base_estimator_
    self._validate_estimator()
    self.classes_, y = np.unique(y, return_inverse=True)
    self.n_classes_: int = self.classes_.shape[0]
    self.estimators_: List[BaseEstimator] = []
    self.subspaces_: List[Tuple[int, ...]] = []
    result = self._train(X, y, sample_weight)
    self.estimators_, self.subspaces_ = tuple(zip(*result))  # type: ignore
    return self
def _validate_y(self, y):
    """Validate the label vector."""
    y = column_or_1d(y, warn=True)
    check_classification_targets(y)
    # record the class set; note that the original (non-encoded) labels
    # are returned, only classes_ and n_classes_ are stored here
    self.classes_, y_encoded = np.unique(y, return_inverse=True)
    self.n_classes_ = len(self.classes_)
    return y
def fit(self, x, y):
    from cvxopt import matrix, solvers
    solvers.options['show_progress'] = False

    check_classification_targets(y)
    x, y = check_X_y(x, y)
    x_s, x_u = x[y == +1, :], x[y == 0, :]
    n_s, n_u = len(x_s), len(x_u)
    p_p = self.prior
    p_n = 1 - self.prior
    p_s = p_p ** 2 + p_n ** 2
    k_s = self._basis(x_s)
    k_u = self._basis(x_u)
    d = k_u.shape[1]

    P = np.zeros((d + 2 * n_u, d + 2 * n_u))
    P[:d, :d] = self.lam * np.eye(d)
    q = np.vstack((
        -p_s / (n_s * (p_p - p_n)) * k_s.T.dot(np.ones((n_s, 1))),
        -p_n / (n_u * (p_p - p_n)) * np.ones((n_u, 1)),
        -p_p / (n_u * (p_p - p_n)) * np.ones((n_u, 1))))
    G = np.vstack((
        np.hstack((np.zeros((n_u, d)), -np.eye(n_u),
                   np.zeros((n_u, n_u)))),
        np.hstack((0.5 * k_u, -np.eye(n_u), np.zeros((n_u, n_u)))),
        np.hstack((k_u, -np.eye(n_u), np.zeros((n_u, n_u)))),
        np.hstack((np.zeros((n_u, d)), np.zeros((n_u, n_u)),
                   -np.eye(n_u))),
        np.hstack((-0.5 * k_u, np.zeros((n_u, n_u)), -np.eye(n_u))),
        np.hstack((-k_u, np.zeros((n_u, n_u)), -np.eye(n_u)))))
    h = np.vstack((
        np.zeros((n_u, 1)),
        -0.5 * np.ones((n_u, 1)),
        np.zeros((n_u, 1)),
        np.zeros((n_u, 1)),
        -0.5 * np.ones((n_u, 1)),
        np.zeros((n_u, 1))))
    sol = solvers.qp(matrix(P), matrix(q), matrix(G), matrix(h))
    self.coef_ = np.array(sol['x'])[:d]
def fit(self, X: np.ndarray, y: np.ndarray):
    X, y = check_X_y(X, y, allow_nd=True)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.classes_, class_counts = np.unique(y, return_counts=True)
    self.n_classes_ = len(self.classes_)
    self.priors_ = self.priors or (class_counts / np.sum(class_counts))

    X = self._preprocess(X)

    graph = self.graph
    if graph is None:
        if self.structure == 'naive':
            graph = self._naive_structure()
        elif self.structure == 'tree':
            graph = self._chow_liu_structure(X, y)
        else:
            raise ValueError(f'Invalid structure type: {self.structure}')

    self.graph_ = graph
    self.models_ = [
        self._partial_fit(graph_k, X[y == cls])
        for graph_k, cls in zip(graph, self.classes_)
    ]
    return self
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

        .. versionadded:: 0.18

    Returns
    -------
    self : object
    """
    check_classification_targets(y)
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')

    if self.voting not in ('soft', 'hard'):
        raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                         % self.voting)

    # label encoding is intentionally skipped here; the pre-fitted
    # estimators are used as-is
    # self.le_ = LabelEncoder().fit(y)
    # self.classes_ = self.le_.classes_
    # transformed_y = self.le_.transform(y)

    self.estimators_ = [est[1] for est in self.estimators]
    return self
    # super().fit(X, transformed_y, sample_weight)
def fit_transformer(self, X, y, epochs=100, lr=3e-4):
    # format y
    check_classification_targets(y)

    # build a small convolutional network with a softmax head
    self.network = keras.Sequential()
    self.network.add(layers.Conv2D(
        filters=16, kernel_size=(3, 3), activation='relu',
        input_shape=np.shape(X)[1:]))
    self.network.add(layers.BatchNormalization())
    self.network.add(layers.Conv2D(
        filters=32, kernel_size=(3, 3), strides=2, padding="same",
        activation='relu'))
    self.network.add(layers.BatchNormalization())
    self.network.add(layers.Conv2D(
        filters=64, kernel_size=(3, 3), strides=2, padding="same",
        activation='relu'))
    self.network.add(layers.BatchNormalization())
    self.network.add(layers.Conv2D(
        filters=128, kernel_size=(3, 3), strides=2, padding="same",
        activation='relu'))
    self.network.add(layers.BatchNormalization())
    self.network.add(layers.Conv2D(
        filters=254, kernel_size=(3, 3), strides=2, padding="same",
        activation='relu'))
    self.network.add(layers.Flatten())
    self.network.add(layers.Dense(2000, activation='relu'))
    self.network.add(layers.Dense(2000, activation='relu'))
    self.network.add(layers.Dense(units=len(np.unique(y)),
                                  activation='softmax'))

    self.network.compile(
        loss='categorical_crossentropy',
        metrics=['acc'],
        optimizer=keras.optimizers.Adam(lr))
    self.network.fit(
        X,
        keras.utils.to_categorical(y),
        epochs=epochs,
        callbacks=[EarlyStopping(patience=4, monitor="val_acc")],
        verbose=self.verbose,
        validation_split=.33)

    # drop the softmax head and keep the penultimate layer as the encoder
    self.encoder = keras.models.Model(
        inputs=self.network.inputs,
        outputs=self.network.layers[-2].output)

    # make sure to flag that we're fit
    self.transformer_fitted_ = True
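keras.utils.to_categorical above expects integer-encoded labels and produces the one-hot targets for the softmax head; for example (my sketch, assuming a TensorFlow/Keras install):

import numpy as np
from tensorflow import keras

print(keras.utils.to_categorical(np.array([0, 2, 1]), num_classes=3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]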
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learn the mean target value per category or bin.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : pandas series of shape = [n_samples,]
        The target variable.
    """
    check_classification_targets(y)

    self.classes_ = unique_labels(y)

    # check that y is binary
    if len(self.classes_) > 2:
        raise NotImplementedError(
            "This classifier is designed for binary classification only. "
            "The target has more than 2 unique values.")

    # if the target has values other than 0 and 1, remap them so that
    # meaningful averages can be computed
    if any(x for x in self.classes_ if x not in [0, 1]):
        y = np.where(y == unique_labels(y)[0], 0, 1)

    return super().fit(X, y)
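The remapping branch sends the first (sorted) class label to 0 and every other label to 1; in isolation (my sketch):

import numpy as np

y = np.array(['yes', 'no', 'yes'])
# np.unique sorts, so 'no' is the first class and becomes 0
print(np.where(y == np.unique(y)[0], 0, 1))  # [1 0 1]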
def fit(self, X, y):
    """Fit the model according to the given training data and parameters.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array, shape = [n_samples]
        Target values.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.classes_, y = np.unique(y, return_inverse=True)
    n_samples, n_features = X.shape
    n_classes = len(self.classes_)

    _, counts = np.unique(y, return_counts=True)
    if np.any(counts > n_features):
        warnings.warn("Found some classes with more counts than input "
                      "features. Results may be unstable.")

    # build one hat matrix per class from that class's samples
    self.hat_ = []
    for ind in range(n_classes):
        Xg = X[y == ind, :]
        Gg = np.dot(Xg, Xg.T)
        self.hat_.append(np.dot(np.dot(Xg.T, np.linalg.inv(Gg)), Xg))

    return self
def fit(self, X, y):
    """Fit underlying estimators.

    Parameters
    ----------
    X : (sparse) array-like, shape = [n_samples, n_features]
        Data.

    y : array-like, shape = [n_samples]
        Multi-class targets.

    Returns
    -------
    estimators_indices : list
        The fitted one-vs-one estimators (and their pair indices,
        when present).
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    check_classification_targets(y)

    self.classes_ = np.unique(y)
    if len(self.classes_) == 1:
        raise ValueError("OneVsOneClassifier can not be fit when only one"
                         " class is present.")
    n_classes = self.classes_.shape[0]
    estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_ovo_binary)
        (self.estimator, X, y, self.classes_[i], self.classes_[j])
        for i in range(n_classes) for j in range(i + 1, n_classes)))))

    self.estimators_ = estimators_indices[0]
    # try:
    #     self.pairwise_indices_ = (
    #         estimators_indices[1] if self._pairwise else None)
    # except AttributeError:
    #     self.pairwise_indices_ = None

    return estimators_indices
def fit(self, X, y):
    """Fit classifier.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,)
        The ground truth of the input samples (labels).
    """
    # Validate inputs X and y
    X, y = check_X_y(X, y)
    X = check_array(X)
    check_classification_targets(y)
    self._classes = len(np.unique(y))

    if self.pre_fitted:
        print("Training skipped")
        return self

    for clf in self.classifiers:
        clf.fit(X, y)
        clf.fitted_ = True
    return self
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training vectors, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        support sample weights.

    Returns
    -------
    self : object
    """
    check_classification_targets(y)
    self._le = LabelEncoder().fit(y)
    self.classes_ = self._le.classes_
    return super().fit(X, self._le.transform(y), sample_weight)
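The LabelEncoder round-trip this pattern relies on (encode for the base fit, keep classes_ for decoding predictions later); a quick illustration (my sketch):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(['spam', 'ham', 'spam'])
print(le.classes_)                    # ['ham' 'spam']
print(le.transform(['spam', 'ham']))  # [1 0]
print(le.inverse_transform([1, 0]))   # ['spam' 'ham']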
def _validate_y(self, y):
    """Validate the label vector."""
    y = column_or_1d(y, warn=True)
    check_classification_targets(y)
    return y
def fit(self, X, y):
    '''
    Fits Logistic Regression with ARD

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples]
       Target values

    Returns
    -------
    self : object
        Returns self.
    '''
    X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)
    n_samples, n_features = X.shape

    # preprocess features
    self._X_mean = np.zeros(n_features)
    self._X_std = np.ones(n_features)
    if self.normalize:
        self._X_mean, self._X_std = np.mean(X, 0), np.std(X, 0)
        X = (X - self._X_mean) / self._X_std
    if self.fit_intercept:
        X = np.concatenate((np.ones([n_samples, 1]), X), 1)
        n_features += 1

    # preprocess targets
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    # if multiclass use OVR (i.e. fit one classifier per class)
    self.coef_, self.active_, self.lambda_ = list(), list(), list()
    self.intercept_, self.sigma_ = list(), list()
    for pos_class in self.classes_:
        if n_classes == 2:
            pos_class = self.classes_[1]
        mask = (y == pos_class)
        y_bin = np.zeros(y.shape, dtype=np.float64)
        y_bin[mask] = 1
        coef_, intercept_, active_, sigma_, A = self._fit(
            X, y_bin, n_samples, n_features)
        self.coef_.append(coef_)
        self.active_.append(active_)
        self.intercept_.append(intercept_)
        self.sigma_.append(sigma_)
        self.lambda_.append(A)
        # in case of binary classification fit only one classifier
        if n_classes == 2:
            break
    return self
def get_model(self, X, y):
    if not isinstance(X, (KDTree, BallTree)):
        X, y = check_X_y(X, y, "csr", multi_output=True)

    if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
        if y.ndim != 1:
            warnings.warn("A column-vector y was passed when a 1d array "
                          "was expected. Please change the shape of y to "
                          "(n_samples, ), for example using ravel().",
                          DataConversionWarning, stacklevel=2)
        self.outputs_2d_ = False
        y = y.reshape((-1, 1))
    else:
        self.outputs_2d_ = True

    check_classification_targets(y)
    self.classes_ = []
    # np.int was removed from NumPy; plain int keeps the same behaviour
    self._y = np.empty(y.shape, dtype=int)
    for k in range(self._y.shape[1]):
        classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True)
        self.classes_.append(classes)

    if not self.outputs_2d_:
        self.classes_ = self.classes_[0]
        self._y = self._y.ravel()

    return self._fit(X)
def fit(self, X, y):
    """
    Fits the transformer to data X with labels y.

    Parameters
    ----------
    X : ndarray
        Input data matrix.
    y : ndarray
        Output (i.e. response data matrix).
    """
    check_classification_targets(y)

    _, y = np.unique(y, return_inverse=True)
    self.num_classes = len(np.unique(y))

    # more typechecking
    self.network.compile(
        loss=self.loss,
        optimizer=self.optimizer,
        **self.compile_kwargs)
    self.network.fit(
        X,
        keras.utils.to_categorical(y, num_classes=self.num_classes),
        **self.fit_kwargs)
    self._is_fitted = True
    return self
def fit(self, X, y):
    if self.fit_flag_:
        self._clear_params()

    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.classes_ = unique_labels(y)
    self.X_ = X
    self.y_ = y

    # setup parameters
    n_samples, n_features = X.shape
    if self.w_ is None:
        self.w0_ = 0
        self.w_ = np.zeros(n_features)
        self.V_ = np.random.normal(0, 0.001,
                                   (self.n_factors, n_features))

    self._update_class_weight(X, y)

    self.train_tracker_.start_train()
    for n_epoch in range(self.epochs):
        self.train_tracker_.start_epoch(n_epoch)
        self._train(X, y, n_samples, n_features)
        self.train_tracker_.end_epoch()
    self.train_tracker_.end_train()

    self.fit_flag_ = True
    return self
def fit(self, X, y, sample_weight=None):
    """
    Fit the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Individual weights for each sample.

    Returns
    -------
    self
    """
    check_classification_targets(y)
    if sample_weight is None:
        self.classes_, self.counts_ = np.unique(y, return_counts=True)
    else:
        sample_weight = _check_sample_weight(sample_weight, X)
        sample_weight = sample_weight / sample_weight.mean()
        df = pd.DataFrame({'y': y, 'sample_weight': sample_weight})
        df = df.groupby('y').sum()
        self.classes_ = df.index.values
        self.counts_ = df.sample_weight.values
    self.counts_ = self.counts_ / self.counts_.sum()
    self.dominant_class_ = self.classes_[np.argmax(self.counts_)]
    return self
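The weighted branch boils down to per-class sums of the normalized weights via groupby; a compact check of that logic (my sketch):

import numpy as np
import pandas as pd

y = np.array([0, 0, 1])
w = np.array([0.5, 0.5, 2.0])
w = w / w.mean()  # mean-normalize, as in the fit above
df = pd.DataFrame({'y': y, 'sample_weight': w}).groupby('y').sum()
print(df.index.values)          # [0 1] -> classes_
print(df.sample_weight.values)  # weighted counts per class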
def _validate_y_class_weight(self, y):
    check_classification_targets(y)

    y = np.copy(y)
    expanded_class_weight = None

    if self.class_weight is not None:
        y_original = np.copy(y)

    self.classes_ = []
    self.n_classes_ = []

    y_store_unique_indices = np.zeros(y.shape, dtype=int)
    for k in range(self.n_outputs_):
        classes_k, y_store_unique_indices[:, k] = np.unique(
            y[:, k], return_inverse=True)
        self.classes_.append(classes_k)
        self.n_classes_.append(classes_k.shape[0])
    y = y_store_unique_indices

    if self.class_weight is not None:
        valid_presets = ('auto', 'balanced', 'subsample',
                         'balanced_subsample')
        if isinstance(self.class_weight, six.string_types):
            if self.class_weight not in valid_presets:
                raise ValueError('Valid presets for class_weight include '
                                 '"balanced" and "balanced_subsample". '
                                 'Given "%s".' % self.class_weight)
            if self.class_weight == "subsample":
                warn("class_weight='subsample' is deprecated in 0.17 and "
                     "will be removed in 0.19. It was replaced by "
                     "class_weight='balanced_subsample' using the "
                     "balanced strategy.", DeprecationWarning)
            if self.warm_start:
                warn('class_weight presets "balanced" or '
                     '"balanced_subsample" are not recommended for '
                     'warm_start if the fitted data differs from the full '
                     'dataset. In order to use "balanced" weights, use '
                     'compute_class_weight("balanced", classes, y). In '
                     'place of y you can use a large enough sample of the '
                     'full training set target to properly estimate the '
                     'class frequency distributions. Pass the resulting '
                     'weights as the class_weight parameter.')

        if (self.class_weight not in ['subsample', 'balanced_subsample']
                or not self.bootstrap):
            if self.class_weight == 'subsample':
                class_weight = 'auto'
            elif self.class_weight == "balanced_subsample":
                class_weight = "balanced"
            else:
                class_weight = self.class_weight
            with warnings.catch_warnings():
                if class_weight == "auto":
                    warnings.simplefilter('ignore', DeprecationWarning)
                expanded_class_weight = compute_sample_weight(
                    class_weight, y_original)

    return y, expanded_class_weight
def test_check_classification_targets():
    for y_type in EXAMPLES.keys():
        if y_type in ["unknown", "continuous", 'continuous-multioutput']:
            for example in EXAMPLES[y_type]:
                msg = 'Unknown label type: '
                assert_raises_regex(ValueError, msg,
                                    check_classification_targets, example)
        else:
            for example in EXAMPLES[y_type]:
                check_classification_targets(example)
def _validate_y(self, y):
    y = column_or_1d(y, warn=True)
    check_classification_targets(y)
    self.classes_, y = np.unique(y, return_inverse=True)
    n_classes = len(self.classes_)
    if n_classes > 2:
        raise ValueError("This is a binary classification algorithm; "
                         "use a dataset with only 2 classes.")
    return y
def fit(self, X, y, sample_weight=None):
    """
    Build a classifier from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.
    """
    self._validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    if sp.isspmatrix(X):
        self._is_sparse_train_X = True
    else:
        self._is_sparse_train_X = False
    self._n_samples, self._n_features = X.shape
    sample_weight = self._get_sample_weight(sample_weight)
    check_consistent_length(X, y, sample_weight)
    check_classification_targets(y)

    self._classes = sorted(np.unique(y))
    self._n_classes = len(self._classes)
    self._classes_map = {}

    self._set_params_with_dependencies()
    params = self._get_params()

    if self._n_classes == 2:
        self._classes_map[0] = self._classes[0]
        self._classes_map[1] = self._classes[1]
        self._estimators = [None]
        y = (y == self._classes[0]).astype(int)
        self._fit_binary_task(X, y, sample_weight, params)
    elif self._n_classes > 2:
        if sp.isspmatrix_dok(X):
            X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
        self._estimators = [None] * self._n_classes
        self._fit_multiclass_task(X, y, sample_weight, params)
    else:
        raise ValueError(
            "Classifier can't predict when only one class is present.")

    self._fitted = True
    return self
def fit(self, X, y):
    '''
    Fits Bayesian Logistic Regression

    Parameters
    ----------
    X: array-like of size (n_samples, n_features)
       Training data, matrix of explanatory variables

    y: array-like of size (n_samples, )
       Target values

    Returns
    -------
    self: object
       self
    '''
    # preprocess data
    X, y = check_X_y(X, y, dtype=np.float64)
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    # prepare for ovr if required
    n_samples, n_features = X.shape
    if self.fit_intercept:
        X = self._add_intercept(X)

    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes")
    if n_classes > 2:
        self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
        self.intercept_ = [0] * n_classes
    else:
        self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]

    # make classifier for each class (one-vs-the-rest)
    for i in range(len(self.coef_)):
        if n_classes == 2:
            pos_class = self.classes_[1]
        else:
            pos_class = self.classes_[i]
        mask = (y == pos_class)
        y_bin = np.ones(y.shape, dtype=np.float64)
        y_bin[~mask] = self._mask_val
        coef_, sigma_ = self._fit(X, y_bin)
        if self.fit_intercept:
            self.intercept_[i], self.coef_[i] = self._get_intercept(coef_)
        else:
            self.coef_[i] = coef_
        self.sigma_[i] = sigma_

    self.coef_ = np.asarray(self.coef_)
    return self
def fit(self, X, y):
    '''
    Fits Bayesian Logistic Regression with Laplace approximation

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples]
       Target values

    Returns
    -------
    self: object
       self
    '''
    # preprocess data
    X, y = check_X_y(X, y, dtype=np.float64)
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    # take into account bias term if required
    n_samples, n_features = X.shape
    n_features = n_features + int(self.fit_intercept)

    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes")
    if n_classes > 2:
        self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
        self.intercept_ = [0] * n_classes
    else:
        self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]

    for i in range(len(self.coef_)):
        w0 = np.zeros(n_features)
        if n_classes == 2:
            pos_class = self.classes_[1]
        else:
            pos_class = self.classes_[i]
        mask = (y == pos_class)
        y_bin = np.ones(y.shape, dtype=np.float64)
        y_bin[~mask] = -1.
        coef, sigma_ = self._fit(X, y_bin, w0, self.alpha)
        if self.fit_intercept:
            self.intercept_[i] = coef[-1]
            coef_ = coef[:-1]
        else:
            # no bias term was appended, the whole vector is the weights
            coef_ = coef
        self.coef_[i] = coef_
        self.sigma_[i] = sigma_

    self.coef_ = np.asarray(self.coef_)
    return self
def _prepare(self, X, Y):
    '''Preprocess data before training.'''
    check_classification_targets(Y)
    self.classes_ = np.unique(Y)
    if len(self.classes_) < 2:
        raise ValueError("The number of classes has to be at least 2; "
                         "got ", len(self.classes_))

    self.multiclass_ = len(self.classes_) > 2
    # X can be a samples matrix or a list of kernels
    KL = process_list(X, self.generator)
    self.KL, self.Y = check_KL_Y(KL, Y)
    self.n_kernels = len(self.KL)
    return
def _encode_y(self, y):
    # encode classes into 0 ... n_classes - 1 and sets attributes classes_
    # and n_trees_per_iteration_
    check_classification_targets(y)

    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y)
    self.classes_ = label_encoder.classes_
    n_classes = self.classes_.shape[0]
    # only 1 tree for binary classification. For multiclass
    # classification, we build 1 tree per class.
    self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
    encoded_y = encoded_y.astype(Y_DTYPE, copy=False)
    return encoded_y
def fit(self, X, y):
    check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                   'bsr', 'lil', 'dia'])
    check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                  'bsr', 'lil', 'dia'])
    self.X_ = X
    check_classification_targets(y)

    # labels are 1..n_classes; 0 marks an unlabeled node
    n_samples = len(y)
    n_classes = len(np.unique(y[np.nonzero(y)]))

    # create diagonal matrix of degree of nodes
    if sparse.isspmatrix(self.X_):
        B_ = self.X_.copy().astype(float)
        D = np.array(csr_matrix.sum(self.X_, axis=1), dtype=float).T[0]
    else:
        B_ = np.copy(self.X_).astype(float)
        D = np.array(np.sum(self.X_, axis=1), dtype=float)

    # if (- self.sigma) and (self.sigma - 1) are not equal we have a
    # different diagonal matrix on the left and right sides
    if (- self.sigma) == (self.sigma - 1):
        D_left = D_right = np.power(D, - self.sigma)
    else:
        D_left = np.power(D, - self.sigma)
        D_right = np.power(D, self.sigma - 1)

    # M_ = D_left.dot(B_)
    for i, d in enumerate(D_left):
        B_[i, :] *= d
    # B_ = M_.dot(D_right)
    for i, d in enumerate(D_right):
        B_[:, i] *= d

    # create labeled data Z
    dimension = (n_samples, n_classes)
    labels = np.nonzero(y)
    ans_y = np.zeros(dimension)
    for l in labels[0]:
        ans_y[l][y[l] - 1] = 1

    Z_ = (self.sigma / (1 + self.sigma)) * ans_y
    self.initial_vector_ = np.ones(dimension) / n_classes
    self._get_method_(B_, Z_)
    return self
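With sigma = 0.5 the two exponents coincide and the scaling above reduces to the familiar symmetric normalization D^(-1/2) B D^(-1/2); a toy check (my sketch):

import numpy as np

B = np.array([[0., 2.], [2., 0.]])
D = B.sum(axis=1)
sigma = 0.5
left = np.power(D, -sigma)
right = np.power(D, sigma - 1)
# rows scaled by d^(-sigma), columns by d^(sigma - 1)
print(B * left[:, None] * right[None, :])  # [[0. 1.] [1. 0.]]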
def _set_n_classes(self, y):
    """Set the number of classes if `y` is presented, which is not
    expected. It could be useful for multi-class outlier detection.

    Parameters
    ----------
    y : numpy array of shape (n_samples,)
        Ground truth.

    Returns
    -------
    self
    """
    self._classes = 2  # default as binary classification
    if y is not None:
        check_classification_targets(y)
        self._classes = len(np.unique(y))
        warnings.warn(
            "y should not be presented in unsupervised learning.")
    return self
def fit_resample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {array-like, sparse matrix}, shape \
            (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : array-like, shape (n_samples_new,)
        The corresponding label of `X_resampled`.
    """
    self._deprecate_ratio()

    check_classification_targets(y)
    X, y, binarize_y = self._check_X_y(X, y)

    self.sampling_strategy_ = check_sampling_strategy(
        self.sampling_strategy, y, self._sampling_type)

    output = self._fit_resample(X, y)

    if binarize_y:
        y_sampled = label_binarize(output[1], np.unique(y))
        if len(output) == 2:
            return output[0], y_sampled
        return output[0], y_sampled, output[2]
    return output
def fit(self, X, Y):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Matrix of the examples, where n_samples is the number of samples
        and n_features is the number of features.

    Y : array-like, shape = [n_samples]
        Array of the labels relative to X.

    Returns
    -------
    self : object
        Returns self.
    """
    X, Y = validation.check_X_y(X, Y, dtype=np.float64, order='C',
                                accept_sparse='csr')
    # check_consistent_length(X, Y)
    check_classification_targets(Y)
    self.classes_ = np.unique(Y)
    if len(self.classes_) < 2:
        raise ValueError("The number of classes has to be at least 2; "
                         "got ", len(self.classes_))

    if len(self.classes_) == 2:
        self.multiclass_ = False
        return self._fit(X, Y)
    else:
        self.multiclass_ = True
        if self.multiclass_strategy == 'ovo':
            return self._one_vs_one(X, Y)
        else:
            return self._one_vs_rest(X, Y)
def fit_resample(self, X, y):
    check_classification_targets(y)
    self.fit(X, y)
    return X, y
def _fit(self, X, y, sample_weight=None, relative_penalties=None):
    if self.lambda_path is not None:
        n_lambda = len(self.lambda_path)
        min_lambda_ratio = 1.0
    else:
        n_lambda = self.n_lambda
        min_lambda_ratio = self.min_lambda_ratio

    check_classification_targets(y)
    self.classes_ = np.unique(y)  # the output of np.unique is sorted
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("Training data need to contain at least 2 "
                         "classes.")

    # glmnet requires the labels as a one-hot-encoded array of
    # (n_samples, n_classes)
    if n_classes == 2:
        # Normally we use 1/0 for the positive and negative classes.
        # Since np.unique sorts the output, the negative class will be in
        # the 0th column. We want a model predicting the positive class,
        # not the negative class, so we flip the columns here (the !=
        # condition).
        #
        # Broadcast comparison of self.classes_ to all rows of y. See the
        # numpy rules on broadcasting for more info, essentially this
        # "reshapes" y to (n_samples, n_classes) and self.classes_ to
        # (n_samples, n_classes) and performs an element-wise comparison
        # resulting in _y with shape (n_samples, n_classes).
        _y = (y[:, None] != self.classes_).astype(np.float64, order='F')
    else:
        # multinomial case, glmnet uses the entire array so we can
        # keep the original order.
        _y = (y[:, None] == self.classes_).astype(np.float64, order='F')

    # use sample weights, making sure all weights are positive
    # this is inspired by the R wrapper for glmnet, in lognet.R
    if sample_weight is not None:
        weight_gt_0 = sample_weight > 0
        sample_weight = sample_weight[weight_gt_0]
        _y = _y[weight_gt_0, :]
        X = X[weight_gt_0, :]
        _y = _y * np.expand_dims(sample_weight, 1)

    # we need some sort of "offset" array for glmnet
    # an array of shape (n_examples, n_classes)
    offset = np.zeros((X.shape[0], n_classes), dtype=np.float64,
                      order='F')
    exclude_vars = 0

    # how much each feature should be penalized relative to the others
    # this may be useful to expose to the caller if there are vars that
    # must be included in the final model or there is some prior knowledge
    # about how important some vars are relative to others, see the glmnet
    # vignette:
    # http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
    if relative_penalties is None:
        relative_penalties = np.ones(X.shape[1], dtype=np.float64,
                                     order='F')

    coef_bounds = np.empty((2, X.shape[1]), dtype=np.float64, order='F')
    coef_bounds[0, :] = self.lower_limits
    coef_bounds[1, :] = self.upper_limits

    if n_classes == 2:
        # binomial, tell glmnet there is only one class
        # otherwise we will get a coef matrix with two dimensions
        # where each pair are equal in magnitude and opposite in sign
        # also since the magnitudes are constrained to sum to one, the
        # returned coefficients would be one half of the proper values
        n_classes = 1

    # This is a stopping criterion (nx)
    # R defaults to nx = num_features, and ne = num_features + 1
    if self.max_features is None:
        max_features = X.shape[1]
    else:
        max_features = self.max_features

    # for documentation on the glmnet function lognet, see doc.py
    if issparse(X):
        _x = csc_matrix(X, dtype=np.float64, copy=True)

        (self.n_lambda_,
         self.intercept_path_,
         ca,
         ia,
         nin,
         _,  # dev0
         _,  # dev
         self.lambda_path_,
         _,  # nlp
         jerr) = splognet(self.alpha,
                          _x.shape[0],
                          _x.shape[1],
                          n_classes,
                          _x.data,
                          _x.indptr + 1,  # Fortran uses 1-based indexing
                          _x.indices + 1,
                          _y,
                          offset,
                          exclude_vars,
                          relative_penalties,
                          coef_bounds,
                          max_features,
                          X.shape[1] + 1,
                          min_lambda_ratio,
                          self.lambda_path,
                          self.tol,
                          n_lambda,
                          self.standardize,
                          self.fit_intercept,
                          self.max_iter,
                          0)
    else:  # not sparse
        # some notes: glmnet requires both x and y to be float64, the two
        # arrays may also be overwritten during the fitting process, so
        # they need to be copied prior to calling lognet. The fortran
        # wrapper will copy any arrays passed to a wrapped function if
        # they are not in the fortran layout, to avoid making extra
        # copies, ensure x and y are `F_CONTIGUOUS` prior to calling
        # lognet.
        _x = X.astype(dtype=np.float64, order='F', copy=True)

        (self.n_lambda_,
         self.intercept_path_,
         ca,
         ia,
         nin,
         _,  # dev0
         _,  # dev
         self.lambda_path_,
         _,  # nlp
         jerr) = lognet(self.alpha,
                        n_classes,
                        _x,
                        _y,
                        offset,
                        exclude_vars,
                        relative_penalties,
                        coef_bounds,
                        X.shape[1] + 1,
                        min_lambda_ratio,
                        self.lambda_path,
                        self.tol,
                        max_features,
                        n_lambda,
                        self.standardize,
                        self.fit_intercept,
                        self.max_iter,
                        0)

    # raises RuntimeError if self.jerr_ is nonzero
    self.jerr_ = jerr
    _check_error_flag(self.jerr_)

    # glmnet may not return the requested number of lambda values, so we
    # need to trim the trailing zeros from the returned path so
    # len(lambda_path_) is equal to n_lambda_
    self.lambda_path_ = self.lambda_path_[:self.n_lambda_]
    # also fix the first value of lambda
    self.lambda_path_ = _fix_lambda_path(self.lambda_path_)
    self.intercept_path_ = self.intercept_path_[:, :self.n_lambda_]
    # also trim the compressed coefficient matrix
    ca = ca[:, :, :self.n_lambda_]
    # and trim the array of n_coef per lambda (may or may not be non-zero)
    nin = nin[:self.n_lambda_]
    # decompress the coefficients returned by glmnet, see doc.py
    self.coef_path_ = lsolns(X.shape[1], ca, ia, nin)
    # coef_path_ has shape (n_features, n_classes, n_lambda), we should
    # match shape for scikit-learn models:
    # (n_classes, n_features, n_lambda)
    self.coef_path_ = np.transpose(self.coef_path_, axes=(1, 0, 2))

    return self
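The != comparison in the binary branch is worth seeing on toy data: it puts the indicator of the positive class in column 0, which is what the comment about flipping columns means (my sketch):

import numpy as np

y = np.array([0, 1, 1])
classes_ = np.unique(y)  # [0 1], sorted
# column 0 is now the positive-class indicator, column 1 the negative
print((y[:, None] != classes_).astype(np.float64))
# [[0. 1.]
#  [1. 0.]
#  [1. 0.]]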
def fit(self, X, y, sample_weight=None):
    """
    Build a RGF Classifier from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification).

    sample_weight : array-like, shape = [n_samples] or None
        Individual weights for each sample.

    Returns
    -------
    self : object
        Returns self.
    """
    _validate_params(**self.get_params())

    X, y = check_X_y(X, y, accept_sparse=True)
    n_samples, self._n_features = X.shape

    if self.sl2 is None:
        self._sl2 = self.l2
    else:
        self._sl2 = self.sl2

    if isinstance(self.min_samples_leaf, _FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_iter is None:
        if self.loss == "LS":
            self._n_iter = 10
        else:
            self._n_iter = 5
    else:
        self._n_iter = self.n_iter

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=np.float32)
    else:
        sample_weight = column_or_1d(sample_weight, warn=True)
        if (sample_weight <= 0).any():
            raise ValueError("Sample weights must be positive.")
    check_consistent_length(X, y, sample_weight)
    check_classification_targets(y)

    self._classes = sorted(np.unique(y))
    self._n_classes = len(self._classes)
    self._classes_map = {}

    params = dict(max_leaf=self.max_leaf,
                  test_interval=self.test_interval,
                  algorithm=self.algorithm,
                  loss=self.loss,
                  reg_depth=self.reg_depth,
                  l2=self.l2,
                  sl2=self._sl2,
                  normalize=self.normalize,
                  min_samples_leaf=self._min_samples_leaf,
                  n_iter=self._n_iter,
                  n_tree_search=self.n_tree_search,
                  opt_interval=self.opt_interval,
                  learning_rate=self.learning_rate,
                  memory_policy=self.memory_policy,
                  verbose=self.verbose)
    if self._n_classes == 2:
        self._classes_map[0] = self._classes[0]
        self._classes_map[1] = self._classes[1]
        self._estimators = [None]
        y = (y == self._classes[0]).astype(int)
        self._estimators[0] = _RGFBinaryClassifier(**params)
        self._estimators[0].fit(X, y, sample_weight)
    elif self._n_classes > 2:
        if sp.isspmatrix_dok(X):
            X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
        self._estimators = [None] * self._n_classes
        ovr_list = [None] * self._n_classes
        for i, cls_num in enumerate(self._classes):
            self._classes_map[i] = cls_num
            ovr_list[i] = (y == cls_num).astype(int)
            self._estimators[i] = _RGFBinaryClassifier(**params)
        self._estimators = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovr_binary)(self._estimators[i],
                                     X,
                                     ovr_list[i],
                                     sample_weight)
            for i in range(self._n_classes))
    else:
        raise ValueError(
            "Classifier can't predict when only one class is present.")

    self._fitted = True
    return self
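The one-vs-rest preparation above boils down to one 0/1 indicator vector per class; in isolation (my sketch):

import numpy as np

y = np.array(['a', 'b', 'c', 'b'])
classes = sorted(np.unique(y))
ovr = [(y == c).astype(int) for c in classes]
print(ovr)
# [array([1, 0, 0, 0]), array([0, 1, 0, 1]), array([0, 0, 1, 0])]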
def fit(self, X, y):
    '''
    Fits Logistic Regression with ARD

    Parameters
    ----------
    X: array-like of size [n_samples, n_features]
       Training data, matrix of explanatory variables

    y: array-like of size [n_samples]
       Target values

    Returns
    -------
    self : object
        Returns self.
    '''
    X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)

    # normalize, if required
    if self.normalize:
        self._x_mean = np.mean(X, 0)
        self._x_std = np.std(X, 0)
        X = (X - self._x_mean) / self._x_std

    # add bias term if required
    if self.fit_intercept:
        X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)

    # preprocess targets
    check_classification_targets(y)
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError("Need samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    # if multiclass use OVR (i.e. fit one classifier per class)
    if n_classes > 2:
        self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
        self.intercept_, self.active_ = [0] * n_classes, [0] * n_classes
        self.lambda_ = [0] * n_classes
    else:
        self.coef_, self.sigma_, self.intercept_, self.active_ = \
            [0], [0], [0], [0]
        self.lambda_ = [0]

    for i in range(len(self.classes_)):
        if n_classes == 2:
            pos_class = self.classes_[1]
        else:
            pos_class = self.classes_[i]
        mask = (y == pos_class)
        y_bin = np.zeros(y.shape, dtype=np.float64)
        y_bin[mask] = 1
        coef, bias, active, sigma, lambda_ = self._fit(X, y_bin)
        self.coef_[i], self.intercept_[i], self.sigma_[i] = \
            coef, bias, sigma
        self.active_[i], self.lambda_[i] = active, lambda_
        # in case of binary classification fit only one classifier
        if n_classes == 2:
            break

    self.coef_ = np.asarray(self.coef_)
    self.intercept_ = np.asarray(self.intercept_)
    return self
def fit(self, X, y):
    """Fit a semi-supervised label propagation model.

    All the input data is provided as matrix X (labeled and unlabeled)
    and a corresponding label vector y with a dedicated marker value for
    unlabeled samples.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        A {n_samples by n_samples} size matrix will be created from this.

    y : array-like, shape = [n_samples]
        n_labeled_samples (unlabeled points are marked as -1).
        All unlabeled samples will be transductively assigned labels.

    Returns
    -------
    self : returns an instance of self.
    """
    X, y = check_X_y(X, y)
    self.X_ = X
    check_classification_targets(y)

    # actual graph construction (implementations should override this)
    graph_matrix = self._build_graph()

    # label construction
    # construct a categorical distribution for classification only
    classes = np.unique(y)
    classes = (classes[classes != -1])
    self.classes_ = classes

    n_samples, n_classes = len(y), len(classes)

    y = np.asarray(y)
    unlabeled = y == -1
    clamp_weights = np.ones((n_samples, 1))
    clamp_weights[~unlabeled, 0] = 1 - self.alpha

    # initialize distributions
    self.label_distributions_ = np.zeros((n_samples, n_classes))
    for label in classes:
        self.label_distributions_[y == label, classes == label] = 1

    y_static = np.copy(self.label_distributions_)
    if self.alpha > 0.:
        y_static *= self.alpha
    y_static[unlabeled] = 0

    l_previous = np.zeros((self.X_.shape[0], n_classes))

    remaining_iter = self.max_iter
    if sparse.isspmatrix(graph_matrix):
        graph_matrix = graph_matrix.tocsr()
    while (_not_converged(self.label_distributions_, l_previous, self.tol)
           and remaining_iter > 1):
        l_previous = self.label_distributions_
        self.label_distributions_ = safe_sparse_dot(
            graph_matrix, self.label_distributions_)
        # clamp
        self.label_distributions_ = np.multiply(
            clamp_weights, self.label_distributions_) + y_static
        remaining_iter -= 1

    normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
    self.label_distributions_ /= normalizer

    if remaining_iter <= 1:
        warnings.warn('max_iter was reached without convergence.',
                      category=ConvergenceWarning)

    # set the transduction item
    transduction = self.classes_[np.argmax(self.label_distributions_,
                                           axis=1)]
    self.transduction_ = transduction.ravel()
    self.n_iter_ = self.max_iter - remaining_iter
    return self