def fit(self, X, y=None):
    """Learn the per-feature minimum and range used for later scaling.

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_features]
        The data used to compute the per-feature minimum and maximum
        used for later scaling along the features axis. Sparse input is
        requested from ``check_arrays`` in CSC format and only the
        explicitly stored entries of each column are inspected.

    y : ignored
        Not used by this method.

    Returns
    -------
    self
        With ``scale_``, ``min_``, ``data_range`` and ``data_min`` set.

    Raises
    ------
    ValueError
        If ``feature_range[0] >= feature_range[1]``.
    """
    X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
    warn_if_not_float(X, estimator=self)

    feature_range = self.feature_range
    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature range must be smaller"
                         " than maximum. Got %s." % str(feature_range))

    is_sparse = sparse.issparse(X)
    if is_sparse:
        col_mins = []
        col_maxs = []
        for col in range(X.shape[1]):
            start, stop = X.indptr[col], X.indptr[col + 1]
            if start == stop:
                # Column without any stored entry: min and max are both 0.
                col_mins.append(0)
                col_maxs.append(0)
            else:
                stored = X.data[start:stop]
                col_mins.append(stored.min())
                col_maxs.append(stored.max())
        data_min = np.array(col_mins)
        data_range = np.array(col_maxs) - data_min
    else:
        data_min = np.min(X, axis=0)
        data_range = np.max(X, axis=0) - data_min

    # Do not scale constant features
    if isinstance(data_range, np.ndarray):
        constant = data_range == 0.0
        if is_sparse:
            # For a sparse matrix, constant features will be set to one:
            # shifting the minimum down by one achieves that once the
            # range below is forced to 1.
            data_min[constant] = data_min[constant] - 1
        data_range[constant] = 1.0
    elif data_range == 0.:
        data_range = 1.

    self.scale_ = (feature_range[1] - feature_range[0]) / data_range
    self.min_ = feature_range[0] - data_min * self.scale_
    self.data_range = data_range
    self.data_min = data_min
    return self
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling.

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to scale along the features axis. Sparse input is
        requested from ``check_arrays`` in CSC format.

    y : ignored
        Not used by this method.

    copy : boolean or None, optional (default None)
        Forwarded to ``check_arrays``. When None, ``self.copy`` is used.

    Returns
    -------
    X : array or sparse matrix
        The centered and/or scaled data.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``self.center_sparse`` is false and
        ``self.with_mean`` is true.
    """
    copy = copy if copy is not None else self.copy
    X = check_arrays(X, copy=copy, sparse_format="csc")[0]
    if warn_if_not_float(X, estimator=self):
        # ``np.float`` was only an alias of the builtin ``float`` and has
        # been removed from NumPy (1.24+); the builtin is equivalent.
        X = X.astype(float)

    if sparse.issparse(X):
        if self.center_sparse:
            # Shift only the stored entries of each CSC column.
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")

        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def fit(self, X, y=None):
    """Compute the mean and std to be used for later scaling.

    Don't trust the documentation of this module!

    Parameters
    ----------
    X : array-like or sparse matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis. Sparse input is
        requested from ``check_arrays`` in CSC format.

    y : ignored
        Not used by this method.

    Returns
    -------
    self
        With ``mean_`` and ``std_`` set.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``self.center_sparse`` is false and
        ``self.with_mean`` is true.
    """
    X = check_arrays(X, copy=self.copy, sparse_format="csc")[0]
    if warn_if_not_float(X, estimator=self):
        # ``np.float`` was only an alias of the builtin ``float`` and has
        # been removed from NumPy (1.24+); the builtin is equivalent.
        X = X.astype(float)

    if not sparse.issparse(X):
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
        return self

    if self.center_sparse:
        means = []
        variances = []
        # This only works for csc matrices: indptr delimits each column.
        for i in range(X.shape[1]):
            start, stop = X.indptr[i], X.indptr[i + 1]
            if start == stop:
                # Column without stored entries: nothing to shift or scale.
                means.append(0)
                variances.append(1)
            else:
                stored = X.data[start:stop]
                means.append(stored.mean())
                variances.append(stored.var())

        means = np.array(means)
        variances = np.array(variances)

        # If the variance is (numerically) 0, set all occurrences of this
        # feature to 1: shift the mean down by one AND use a unit std.
        # Both adjustments must use the same mask, otherwise a column with
        # a tiny non-zero variance is divided by a tiny std and explodes.
        constant = np.abs(variances) <= 0.0000001
        means[constant] -= 1
        variances[constant] = 1.0

        self.std_ = np.sqrt(variances)
        self.mean_ = means
        return self

    if self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")

    self.mean_ = None
    if self.with_std:
        var = mean_variance_axis0(X)[1]
        self.std_ = np.sqrt(var)
        # Do not scale constant features.
        self.std_[var == 0.0] = 1.0
    else:
        self.std_ = None
    return self
def fit(self, X, y=None):
    """Compute the minimum and maximum to be used for later scaling.

    When ``self.fit_feature_range`` is not None it replaces
    ``self.feature_range`` as the target range for the fitted scaling,
    after being validated as a sub-interval of ``self.feature_range``.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to compute the per-feature minimum and maximum
        used for later scaling along the features axis.

    y : ignored
        Not used by this method.

    Returns
    -------
    self
        With ``scale_``, ``min_``, ``data_range`` and ``data_min`` set.

    Raises
    ------
    ValueError
        If either range has its minimum greater than or equal to its
        maximum, or if ``fit_feature_range`` is not contained in
        ``feature_range``.
    """
    X = check_array(X, copy=self.copy)
    warn_if_not_float(X, estimator=self)

    feature_range = self.feature_range
    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature range must be smaller"
                         " than maximum. Got %s." % str(feature_range))

    fit_feature_range = self.fit_feature_range
    if fit_feature_range is not None:
        if fit_feature_range[0] >= fit_feature_range[1]:
            # Report the offending fit range (the original message
            # mistakenly printed ``feature_range`` here).
            raise ValueError("Minimum of desired (fit) feature range must "
                             "be smaller than maximum. Got %s."
                             % str(fit_feature_range))
        if (fit_feature_range[0] < feature_range[0] or
                fit_feature_range[1] > feature_range[1]):
            raise ValueError("fit_feature_range must be a subset of "
                             "feature_range. Got %s, fit %s."
                             % (str(feature_range),
                                str(fit_feature_range)))
        feature_range = fit_feature_range

    data_min = np.min(X, axis=0)
    data_range = np.max(X, axis=0) - data_min
    # Do not scale constant features
    data_range[data_range == 0.0] = 1.0

    self.scale_ = (feature_range[1] - feature_range[0]) / data_range
    self.min_ = feature_range[0] - data_min * self.scale_
    self.data_range = data_range
    self.data_min = data_min
    return self
def normalize(self, X, norm='l2', axis=1, copy=True):
    """Normalize a dataset along any axis.

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default is True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    Returns
    -------
    X : array or scipy.sparse matrix
        The normalized data.

    Raises
    ------
    ValueError
        If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    # Column-wise work is done on the transpose, so ask for the sparse
    # layout that becomes row-oriented once transposed.
    if axis == 0:
        wanted_format = 'csc'
    elif axis == 1:
        wanted_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_arrays(X, sparse_format=wanted_format, copy=copy)[0]
    warn_if_not_float(X, 'The normalize function')

    by_feature = axis == 0
    if by_feature:
        X = X.T

    if sparse.issparse(X):
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            row_lengths = np.abs(X).sum(axis=1)
        else:
            # ``norm`` was validated above, so this is the 'l2' case.
            row_lengths = row_norms(X)
        # Leave all-zero rows untouched instead of dividing by zero.
        row_lengths[row_lengths == 0.0] = 1.0
        X /= row_lengths[:, np.newaxis]

    if by_feature:
        X = X.T

    return X
def fit(self, X, y=None):
    """Compute the Box-Cox lambda values to be used for later scaling.

    For each feature an offset is derived that is added to the data before
    the Box-Cox transform, and the lambda returned by
    ``scipy.stats.boxcox`` for the offset data is stored. If
    ``self.standardise`` is true, a ``StandardScaler`` is additionally
    fitted on the Box-Cox transformed data.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to compute the per-feature offset and Box-Cox
        lambda. Note that the offset is added to the checked array in
        place.

    y : optional
        Only forwarded to the ``StandardScaler`` fit.

    Returns
    -------
    self
        With ``offset_`` and ``lambda_`` set, and ``standardiser`` set
        when ``self.standardise`` is true.

    Raises
    ------
    Warning
        If ``self.known_min`` is given and exceeds the observed minimum of
        any feature.
    ValueError
        If a non-constant feature becomes constant after the transform.
    """
    X = check_array(X, copy=self.copy, ensure_2d=True)
    warn_if_not_float(X, estimator=self)

    # Take the minimum of each feature (kept 2-D so it broadcasts over X)
    data_min = np.min(X, axis=0, keepdims=True)

    # Sanity check
    # NOTE(review): this *raises* the Warning class as an exception rather
    # than calling warnings.warn; kept as-is because callers may rely on
    # it -- confirm whether a non-fatal warning was intended.
    if self.known_min is not None and np.any(self.known_min > data_min):
        raise Warning("The minimum of the data is less than the supplied"
                      " 'known' minimum value.")

    if self.known_min is not None:
        data_min = np.minimum(data_min, self.known_min)
    else:
        # Since the user didn't know how negative the values could be,
        # let's err on the side of caution a little bit.
        # Note: this has no effect if the data is non-negative, because
        # the resulting negative offset is clipped to 0 below.
        data_min = data_min * 2

    # Need to offset by the negative of the minima so all values are +ve
    offset = -data_min
    # And we need to ensure 0 gets mapped correctly
    offset[offset >= 0] += 1
    # Features whose minimum is strictly positive get a zero offset
    offset = np.maximum(offset, 0)
    # Store this array of feature offsets
    self.offset_ = offset

    # Apply the offset to the raw data
    X += self.offset_

    # Find the optimal Box-Cox transform for each feature
    n_features = X.shape[1]
    lambda_values = np.zeros(n_features)
    Xt = np.zeros(X.shape)
    for i in range(n_features):
        column = X[:, i]
        # Fit the BoxCox transform to the data in this column
        Xt[:, i], lambda_values[i] = stats.boxcox(column)
        # Sanity check:
        # Make sure the transformed values are not all the same if the
        # original data wasn't like that
        # (this is a bug which can happen if lambda was chosen badly by
        # scipy)
        if (not np.allclose(column, column[0])
                and np.allclose(Xt[:, i], Xt[0, i])):
            raise ValueError(
                ("Lambda was badly chosen for feature {}. \n"
                 "Values became singular!").format(i))
            # We should fix this issue by finding a better lambda value
            # ourselves

    # Store the lambda value
    self.lambda_ = lambda_values

    # Fit the to z-score with standard scaler
    if self.standardise:
        self.standardiser = StandardScaler()
        # Fit on the transformed data
        self.standardiser.fit(Xt, y)

    return self
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        un-necessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        axis used to normalize the data along. If 1, independently normalize
        each sample, otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    Returns
    -------
    X : array or scipy.sparse matrix
        The normalized data.

    Raises
    ------
    ValueError
        If ``norm`` is not 'l1' or 'l2', or ``axis`` is not 0 or 1.

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    # NOTE(review): the format is passed positionally; presumably it binds
    # to ``accept_sparse`` as in the keyword call below -- confirm.
    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        # After the optional transpose the matrix is row-oriented (a CSC
        # matrix transposes to CSR), and the in-place kernels below work
        # on CSR rows. Requesting 'csr' here -- rather than the pre-
        # transpose ``sparse_format`` -- avoids converting the transposed
        # matrix back to CSC when axis is 0.
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        else:
            # ``norm`` was validated above, so this is the 'l2' case.
            norms = row_norms(X)
        # Leave all-zero rows untouched instead of dividing by zero.
        norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X