def fit(self, X, y=None):
    """Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation used
        for later scaling along the features axis. Sparse input is
        requested from ``check_array`` in CSC format, which the
        column-wise statistics below rely on.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        The scaler itself, with ``mean_`` and ``std_`` set.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``center_sparse`` is false and ``with_mean``
        is true.

    Notes
    -----
    With ``center_sparse`` enabled, the statistics of a sparse column
    are computed over its explicitly stored entries only. Columns
    without stored entries get mean 0 and std 1. Columns whose variance
    is within ``1e-7`` of zero are treated as constant: their mean is
    lowered by 1 and their std is set to 1, so that subtracting the mean
    and dividing by the std maps the stored entries to (about) 1.
    """
    X = check_array(X, copy=self.copy, accept_sparse="csc", ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        X = X.astype(np.float32)

    if not sparse.issparse(X):
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
        return self

    if self.center_sparse:
        # Variances within this distance of zero mark a constant column.
        tolerance = 0.0000001
        means = []
        variances = []
        # Slicing X.data through X.indptr yields whole columns only for
        # CSC matrices.
        for i in range(X.shape[1]):
            start, stop = X.indptr[i], X.indptr[i + 1]
            if start == stop:
                # No stored entries: nothing to center or scale.
                means.append(0)
                variances.append(1)
                continue
            column = X.data[start:stop]
            column_mean = column.mean()
            column_var = column.var()
            if -tolerance <= column_var <= tolerance:
                # Constant column: shift the mean so that centering
                # leaves the stored entries at 1 instead of 0.
                column_mean -= 1
            means.append(column_mean)
            variances.append(column_var)

        variances = np.array(variances)
        self.std_ = np.sqrt(variances)
        # Use the same tolerance as the mean shift above; resetting only
        # exact zeros would divide near-constant columns by a tiny std.
        self.std_[np.abs(variances) <= tolerance] = 1.0
        self.mean_ = np.array(means)
        return self

    if self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")

    self.mean_ = None
    if self.with_std:
        var = mean_variance_axis(X, axis=0)[1]
        self.std_ = np.sqrt(var)
        # Do not scale features without any variance.
        self.std_[var == 0.0] = 1.0
    else:
        self.std_ = None
    return self
def fit(self, X, y=None):
    """Compute the minimum and maximum to be used for later scaling.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_features]
        The data used to compute the per-feature minimum and maximum
        used for later scaling along the features axis. It is validated
        as 2-d float32 data with at least two samples; sparse input is
        requested in CSC format.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        The scaler itself, with ``scale_``, ``min_``, ``data_range`` and
        ``data_min`` set.

    Raises
    ------
    ValueError
        If ``feature_range[0] >= feature_range[1]``.

    Notes
    -----
    For sparse input the minimum and maximum of a column are taken over
    its explicitly stored entries only; a column without stored entries
    gets minimum and maximum 0.
    """
    X = check_array(X, copy=self.copy, ensure_2d=True,
                    accept_sparse="csc", dtype=np.float32,
                    ensure_min_samples=2)

    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        # np.float64 replaces the removed ``np.float`` alias (it was the
        # builtin float, i.e. the same dtype).
        X = X.astype(np.float64)

    feature_range = self.feature_range
    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature range must be smaller"
                         " than maximum. Got %s." % str(feature_range))

    if sparse.issparse(X):
        data_min = []
        data_max = []
        # Slicing X.data through X.indptr yields whole columns only for
        # CSC matrices.
        for i in range(X.shape[1]):
            start, stop = X.indptr[i], X.indptr[i + 1]
            if start == stop:
                # Column without stored entries.
                data_min.append(0)
                data_max.append(0)
            else:
                column = X.data[start:stop]
                data_min.append(column.min())
                data_max.append(column.max())
        data_min = np.array(data_min, dtype=np.float32)
        data_max = np.array(data_max, dtype=np.float32)
        data_range = data_max - data_min
    else:
        data_min = np.min(X, axis=0)
        data_range = np.max(X, axis=0) - data_min

    # Do not scale constant features
    if isinstance(data_range, np.ndarray):
        constant = data_range == 0.0
        if sparse.issparse(X):
            # For a sparse matrix, constant features will be set to one:
            # lowering the recorded minimum by 1 offsets the stored
            # entries accordingly.
            data_min[constant] -= 1
        data_range[constant] = 1.0
    elif data_range == 0.:
        data_range = 1.

    self.scale_ = (feature_range[1] - feature_range[0]) / data_range
    self.min_ = feature_range[0] - data_min * self.scale_
    self.data_range = data_range
    self.data_min = data_min
    return self
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling.

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to scale along the features axis. Sparse input is
        requested from ``check_array`` in CSC format.

    y : ignored
        Not used by this method.

    copy : bool or None, optional (default None)
        Passed to ``check_array``; when None, ``self.copy`` is used.

    Returns
    -------
    X : array or sparse matrix
        The validated input after centering and/or scaling. The
        arithmetic is applied in place on the validated array.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``center_sparse`` is false and ``with_mean``
        is true.
    """
    check_is_fitted(self, 'std_')

    copy = copy if copy is not None else self.copy
    X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # np.float64 replaces the removed ``np.float`` alias (it was the
        # builtin float, i.e. the same dtype).
        X = X.astype(np.float64)

    if sparse.issparse(X):
        if self.center_sparse:
            # Only the explicitly stored entries of each column are
            # shifted; implicit zeros stay zero. Slicing X.data through
            # X.indptr yields whole columns only for CSC matrices.
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")

        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def fit(self, X, y=None):
    """Compute the minimum and maximum to be used for later scaling.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_features]
        The data used to compute the per-feature minimum and maximum
        used for later scaling along the features axis. It is validated
        as 2-d float32 data with at least two samples; sparse input is
        requested in CSC format.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        The scaler itself, with ``scale_``, ``min_``, ``data_range`` and
        ``data_min`` set.

    Raises
    ------
    ValueError
        If ``feature_range[0] >= feature_range[1]``.

    Notes
    -----
    For sparse input the minimum and maximum of a column are taken over
    its explicitly stored entries only; a column without stored entries
    gets minimum and maximum 0.
    """
    X = check_array(X, copy=self.copy, ensure_2d=True,
                    accept_sparse="csc", dtype=np.float32,
                    ensure_min_samples=2)

    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        # np.float64 replaces the removed ``np.float`` alias (it was the
        # builtin float, i.e. the same dtype).
        X = X.astype(np.float64)

    feature_range = self.feature_range
    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature range must be smaller"
                         " than maximum. Got %s." % str(feature_range))

    if sparse.issparse(X):
        data_min = []
        data_max = []
        # Slicing X.data through X.indptr yields whole columns only for
        # CSC matrices.
        for i in range(X.shape[1]):
            start, stop = X.indptr[i], X.indptr[i + 1]
            if start == stop:
                # Column without stored entries.
                data_min.append(0)
                data_max.append(0)
            else:
                column = X.data[start:stop]
                data_min.append(column.min())
                data_max.append(column.max())
        data_min = np.array(data_min, dtype=np.float32)
        data_max = np.array(data_max, dtype=np.float32)
        data_range = data_max - data_min
    else:
        data_min = np.min(X, axis=0)
        data_range = np.max(X, axis=0) - data_min

    # Do not scale constant features
    if isinstance(data_range, np.ndarray):
        constant = data_range == 0.0
        if sparse.issparse(X):
            # For a sparse matrix, constant features will be set to one:
            # lowering the recorded minimum by 1 offsets the stored
            # entries accordingly.
            data_min[constant] -= 1
        data_range[constant] = 1.0
    elif data_range == 0.:
        data_range = 1.

    self.scale_ = (feature_range[1] - feature_range[0]) / data_range
    self.min_ = feature_range[0] - data_min * self.scale_
    self.data_range = data_range
    self.data_min = data_min
    return self
def fit(self, X, y=None):
    """Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation used
        for later scaling along the features axis. Sparse input is
        requested from ``check_array`` in CSC format, which the
        column-wise statistics below rely on.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        The scaler itself, with ``mean_`` and ``std_`` set.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``center_sparse`` is false and ``with_mean``
        is true.

    Notes
    -----
    With ``center_sparse`` enabled, the statistics of a sparse column
    are computed over its explicitly stored entries only. Columns
    without stored entries get mean 0 and std 1. Columns whose variance
    is within ``1e-7`` of zero are treated as constant: their mean is
    lowered by 1 and their std is set to 1, so that subtracting the mean
    and dividing by the std maps the stored entries to (about) 1.
    """
    X = check_array(X, copy=self.copy, accept_sparse="csc", ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        X = X.astype(np.float32)

    if not sparse.issparse(X):
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
        return self

    if self.center_sparse:
        # Variances within this distance of zero mark a constant column.
        tolerance = 0.0000001
        means = []
        variances = []
        # Slicing X.data through X.indptr yields whole columns only for
        # CSC matrices.
        for i in range(X.shape[1]):
            start, stop = X.indptr[i], X.indptr[i + 1]
            if start == stop:
                # No stored entries: nothing to center or scale.
                means.append(0)
                variances.append(1)
                continue
            column = X.data[start:stop]
            column_mean = column.mean()
            column_var = column.var()
            if -tolerance <= column_var <= tolerance:
                # Constant column: shift the mean so that centering
                # leaves the stored entries at 1 instead of 0.
                column_mean -= 1
            means.append(column_mean)
            variances.append(column_var)

        variances = np.array(variances)
        self.std_ = np.sqrt(variances)
        # Use the same tolerance as the mean shift above; resetting only
        # exact zeros would divide near-constant columns by a tiny std.
        self.std_[np.abs(variances) <= tolerance] = 1.0
        self.mean_ = np.array(means)
        return self

    if self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")

    self.mean_ = None
    if self.with_std:
        var = mean_variance_axis(X, axis=0)[1]
        self.std_ = np.sqrt(var)
        # Do not scale features without any variance.
        self.std_[var == 0.0] = 1.0
    else:
        self.std_ = None
    return self
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a floating point numpy array or a
        scipy.sparse CSR matrix and if axis is 1). Non floating point
        input is always converted, which creates a new array.

    Returns
    -------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The normalized data. Vectors whose norm is zero are left
        unchanged.

    Raises
    ------
    ValueError
        If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % (norm,))

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        # '%s' rather than '%d': a non-numeric axis must still produce
        # this ValueError instead of a formatting TypeError.
        raise ValueError("'%s' is not a supported axis" % (axis,))

    X = check_array(X, sparse_format, copy=copy)
    if warn_if_not_float(X, 'The normalize function'):
        # The in-place division below cannot store its result in a non
        # floating point array.
        X = X.astype(np.float64)

    if axis == 0:
        # Normalize features as the rows of the transposed data.
        X = X.T

    if sparse.issparse(X):
        # The data is row-oriented at this point (a transposed CSC matrix
        # is CSR), so always ask for CSR: asking for ``sparse_format``
        # would convert it back to CSC when axis is 0, while the kernels
        # below work on CSR rows.
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        else:
            norms = row_norms(X)
        # Leave all-zero rows untouched instead of dividing by zero.
        norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X