def inverse_transform(self, X, copy=None):
    """Scale back the data to the original representation

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to scale along the features axis.
    copy : bool or None, optional
        Whether to operate on a copy of ``X``. ``None`` falls back to
        ``self.copy``.

    Returns
    -------
    X : the rescaled data. Dense input is delegated to the parent class;
        sparse input is returned as a CSR matrix.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``self.with_mean`` is truthy.
    """
    check_is_fitted(self, 'scale_')
    copy = copy if copy is not None else self.copy

    if not sparse.issparse(X):
        # Dense input is handled entirely by the parent implementation; its
        # result must be returned, otherwise the sparse-only code below
        # would run on a dense array.
        return super(SparseScaler, self).inverse_transform(X, copy=copy)

    if self.with_mean:
        raise ValueError(
            "Cannot uncenter sparse matrices: pass `with_mean=False` "
            "instead See docstring for motivation and alternatives.")
    if not sparse.isspmatrix_csr(X):
        # The conversion result replaces X, so the explicit copy is skipped.
        X = X.tocsr()
        copy = False
    if copy:
        X = X.copy()
    if self.mean_ is not None:
        # NOTE(review): the result of this call is discarded. Unless `sum`
        # is a project helper that mutates X in place, this does nothing
        # useful -- confirm the intent before relying on it.
        sum(X, self.mean_)
    if self.scale_ is not None:
        inplace_column_scale(X, self.scale_)
    return X
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    y : ignored
        Not used by this method.
    copy : bool or None, optional
        Forwarded to ``check_array``; ``None`` falls back to ``self.copy``.

    Returns
    -------
    X : the validated (and possibly converted) data after centering/scaling.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``self.center_sparse`` is falsy and
        ``self.with_mean`` is truthy.
    """
    check_is_fitted(self, 'std_')

    copy = copy if copy is not None else self.copy
    X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # `np.float` (an alias of the builtin float) was removed from NumPy;
        # np.float64 is the same dtype.
        X = X.astype(np.float64)
    if sparse.issparse(X):
        if self.center_sparse:
            # X was requested in CSC layout above, so indptr delimits the
            # stored entries of each column; only stored entries are shifted.
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def test_inplace_row_scale():
    """Check inplace_row_scale on CSR/CSC input against a dense reference.

    The same checks run for float64 and float32 data, and unsupported
    formats (LIL) must be rejected with a TypeError.
    """
    def check(X, scale):
        Xr = X.tocsr()
        Xc = X.tocsc()
        XA = X.toarray()
        # Dense reference: multiply every row by its own factor.
        XA *= scale.reshape(-1, 1)
        inplace_row_scale(Xc, scale)
        inplace_row_scale(Xr, scale)
        assert_array_almost_equal(Xr.toarray(), Xc.toarray())
        assert_array_almost_equal(XA, Xc.toarray())
        assert_array_almost_equal(XA, Xr.toarray())
        # Exercise the function under test; the previous version called
        # inplace_column_scale here.
        with pytest.raises(TypeError):
            inplace_row_scale(X.tolil(), scale)

    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    scale = rng.rand(100)

    check(X, scale)
    check(X.astype(np.float32), scale.astype(np.float32))
def scale_array(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    """Scale the columns of ``X`` by their standard deviation.

    Parameters
    ----------
    X
        Dense array or sparse matrix; modified in place unless ``copy``.
    zero_center
        Subtract the column means before scaling. Not supported for sparse
        input.
    max_value
        If given, values above it are clipped to it after scaling.
    copy
        Work on ``X.copy()`` instead of ``X``.
    return_mean_std
        Also return the means and standard deviations that were used.

    Returns
    -------
    ``X``, or ``(X, mean, std)`` when ``return_mean_std`` is true.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``zero_center`` is true.
    """
    if copy:
        X = X.copy()
    if not zero_center and max_value is not None:
        logg.info(  # Be careful of what? This should be more specific
            "... be careful when using `max_value` "
            "without `zero_center`.")
    mean, var = _get_mean_var(X)
    std = np.sqrt(var)
    # Guard constant columns for both layouts so that neither path divides
    # by zero.
    std[std == 0] = 1e-12
    if issparse(X):
        if zero_center:
            raise ValueError("Cannot zero-center sparse matrix.")
        sparsefuncs.inplace_column_scale(X, 1 / std)
    else:
        # Honor zero_center for dense input too; previously the mean was
        # subtracted unconditionally.
        if zero_center:
            X -= mean
        X /= std

    # do the clipping
    if max_value is not None:
        logg.debug(f"... clipping at max_value {max_value}")
        X[X > max_value] = max_value

    if return_mean_std:
        return X, mean, std
    else:
        return X
def inverse_transform(self, X, copy=None):
    """Scale back the data to the original representation

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    copy : bool or None, optional
        Whether to work on a copy; ``None`` falls back to ``self.copy``.

    Returns
    -------
    X : ndarray for dense input, CSR matrix for sparse input.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``self.with_mean`` is truthy.
    """
    check_is_fitted(self, 'std_')

    if copy is None:
        copy = self.copy

    if not sparse.issparse(X):
        dense = np.asarray(X)
        if copy:
            dense = dense.copy()
        if self.with_std:
            dense *= self.std_
        if self.with_mean:
            dense += self.mean_
        return dense

    if self.with_mean:
        raise ValueError(
            "Cannot uncenter sparse matrices: pass `with_mean=False` "
            "instead See docstring for motivation and alternatives.")
    if sparse.isspmatrix_csr(X):
        if copy:
            X = X.copy()
    else:
        # Converting already yields a new matrix, so no extra copy is made.
        X = X.tocsr()
    if self.std_ is not None:
        inplace_column_scale(X, self.std_)
    return X
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    y : ignored
        Not used by this method.
    copy : bool or None, optional
        Forwarded to ``check_arrays``; ``None`` falls back to ``self.copy``.

    Returns
    -------
    X : the validated data after centering/scaling.

    Raises
    ------
    ValueError
        If ``X`` is sparse, ``self.center_sparse`` is falsy and
        ``self.with_mean`` is truthy.
    """
    copy = copy if copy is not None else self.copy
    X = check_arrays(X, copy=copy, sparse_format="csc")[0]
    if warn_if_not_float(X, estimator=self):
        # `np.float` (an alias of the builtin float) was removed from NumPy;
        # np.float64 is the same dtype.
        X = X.astype(np.float64)
    if sparse.issparse(X):
        if self.center_sparse:
            # X was requested in CSC layout above, so indptr delimits the
            # stored entries of each column; only stored entries are shifted.
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def transform(self, X, y=None, copy=None):
    """
    Standardize ``X`` with the fitted centering and scaling parameters.

    :param X: Data matrix to scale.
    :type X: numpy.ndarray, shape [n_samples, n_features]
    :param y: Passthrough for scikit-learn ``Pipeline`` compatibility.
    :type y: None
    :param bool copy: Copy the X matrix; ``None`` falls back to ``self.copy``.
    :return: Scaled version of the X data matrix.
    :rtype: numpy.ndarray, shape [n_samples, n_features]
    :raises ValueError: if X is sparse and ``self.with_mean`` is truthy.
    """
    check_is_fitted(self, 'scale_')

    if copy is None:
        copy = self.copy
    X = check_array(X, accept_sparse='csr', copy=copy,
                    estimator=self, dtype=FLOAT_DTYPES)

    if not sparse.issparse(X):
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.scale_
        return X

    # Sparse input: centering is refused, only column scaling is applied.
    if self.with_mean:
        raise ValueError(
            "Cannot center sparse matrices: pass `with_mean=False` "
            "instead. See docstring for motivation and alternatives.")
    if self.scale_ is not None:
        inplace_column_scale(X, 1 / self.scale_)
    return X
def inverse_transform(self, X, copy=None):
    """
    Undo the scaling and return the data in its original representation.

    :param X: Scaled data matrix.
    :type X: numpy.ndarray, shape [n_samples, n_features]
    :param bool copy: Copy the X data matrix; ``None`` falls back to
        ``self.copy``.
    :return: X data matrix with the scaling operation reverted.
    :rtype: numpy.ndarray, shape [n_samples, n_features]
    :raises ValueError: if X is sparse and ``self.with_mean`` is truthy.
    """
    check_is_fitted(self, 'scale_')

    if copy is None:
        copy = self.copy

    if not sparse.issparse(X):
        restored = numpy.asarray(X)
        if copy:
            restored = restored.copy()
        if self.with_std:
            restored *= self.scale_
        if self.with_mean:
            restored += self.mean_
        return restored

    if self.with_mean:
        raise ValueError(
            "Cannot uncenter sparse matrices: pass `with_mean=False` "
            "instead See docstring for motivation and alternatives.")
    if sparse.isspmatrix_csr(X):
        if copy:
            X = X.copy()
    else:
        # The CSR conversion already produces a new matrix.
        X = X.tocsr()
    if self.scale_ is not None:
        inplace_column_scale(X, self.scale_)
    return X
def inverse_transform(self, X):
    """Scale back the data to the original representation

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The rescaled data to be transformed back.

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Transformed array.
    """
    check_is_fitted(self)
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    estimator=self, dtype=FLOAT_DTYPES,
                    force_all_finite='allow-nan')

    if not sparse.issparse(X):
        if self.with_scaling:
            X *= self.scale_
        if self.with_centering:
            X += self.center_
        return X

    # Sparse input is only rescaled column-wise; no center is added back.
    if self.with_scaling:
        inplace_column_scale(X, self.scale_)
    return X
def transform(self, X):
    """Center and scale the data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data used to scale along the specified axis.

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Transformed array.
    """
    check_is_fitted(self)
    X = self._validate_data(X, accept_sparse=('csr', 'csc'),
                            copy=self.copy, estimator=self,
                            dtype=FLOAT_DTYPES, reset=False,
                            force_all_finite='allow-nan')

    if not sparse.issparse(X):
        if self.with_centering:
            X -= self.center_
        if self.with_scaling:
            X /= self.scale_
        return X

    # Sparse input is only rescaled column-wise; no center is subtracted.
    if self.with_scaling:
        inplace_column_scale(X, 1.0 / self.scale_)
    return X
def sparse_center_data(X, y, fit_intercept, normalize=False):
    """
    Compute information needed to center data to have mean zero along
    axis 0. Be aware that X will not be centered since it would break
    the sparsity, but will be normalized if asked so.

    Returns the tuple ``(X, y, X_offset, y_offset, X_std)``.
    """
    if not fit_intercept:
        n_features = X.shape[1]
        X_offset = np.zeros(n_features)
        X_std = np.ones(n_features)
        if y.ndim == 1:
            y_offset = 0.
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_std

    # we might require not to change the csr matrix sometimes
    # store a copy if normalize is True.
    # Change dtype to float64 since mean_variance_axis accepts
    # it that way.
    already_csr = sp.isspmatrix(X) and X.getformat() == 'csr'
    make_sparse = sp.csr_matrix if already_csr else sp.csc_matrix
    X = make_sparse(X, copy=normalize, dtype=np.float64)

    X_offset, X_var = mean_variance_axis(X, axis=0)
    if normalize:
        # transform variance to std in-place
        X_var *= X.shape[0]
        X_std = np.sqrt(X_var, X_var)
        del X_var
        # Leave constant columns unscaled instead of dividing by zero.
        X_std[X_std == 0] = 1
        inplace_column_scale(X, 1. / X_std)
    else:
        X_std = np.ones(X.shape[1])

    y_offset = y.mean(axis=0)
    y = y - y_offset
    return X, y, X_offset, y_offset, X_std
def inverse_transform(self, X, copy=None):
    """Scale back the data to the original representation

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    copy : bool or None, optional
        Whether to work on a copy; ``None`` falls back to ``self.copy``.

    Returns
    -------
    X : ndarray for dense input, CSR matrix for sparse input.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``self.with_mean`` is truthy.
    """
    make_copy = self.copy if copy is None else copy

    if sparse.issparse(X):
        if self.with_mean:
            raise ValueError(
                "Cannot uncenter sparse matrices: pass `with_mean=False` "
                "instead See docstring for motivation and alternatives.")
        if not sparse.isspmatrix_csr(X):
            # Conversion already yields a fresh matrix; skip the copy.
            X = X.tocsr()
        elif make_copy:
            X = X.copy()
        if self.std_ is not None:
            inplace_column_scale(X, self.std_)
        return X

    X = np.asarray(X)
    if make_copy:
        X = X.copy()
    if self.with_std:
        X *= self.std_
    if self.with_mean:
        X += self.mean_
    return X
def normalize_by_umi(matrix):
    """Return a float64 copy of ``matrix.m`` with rescaled columns.

    Each column is multiplied by ``median(reads_per_bc) / reads_per_bc``,
    where ``reads_per_bc`` comes from ``matrix.get_reads_per_bc()``.
    ``matrix`` itself is not modified.
    """
    totals = matrix.get_reads_per_bc()
    # Normalize each barcode's total count by median total count
    factors = np.median(totals) / totals

    normalized = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_column_scale(normalized, factors)
    return normalized
def _scale_precomputed(X, column_means, column_vars, zero_center=True):
    """Scale ``X`` in place using precomputed column statistics.

    Parameters
    ----------
    X
        Dense array or sparse matrix, modified in place.
    column_means
        Per-column means, subtracted when ``zero_center`` is true.
    column_vars
        Per-column variances; their square root is the divisor.
    zero_center
        Subtract ``column_means`` before scaling. Not supported for sparse
        input.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``zero_center`` is true.
    """
    scale = np.sqrt(column_vars)
    # Guard constant columns on every path so that 0/0 never produces NaN.
    scale[scale == 0] = 1e-12

    if zero_center:
        if issparse(X):
            # `X -= means` on a sparse matrix would not update the caller's
            # matrix in place; fail loudly instead of silently doing nothing.
            raise ValueError('Cannot zero-center sparse matrix.')
        X -= column_means
        X /= scale
    else:
        if issparse(X):
            sparsefuncs.inplace_column_scale(X, 1 / scale)
        else:
            X /= scale
def test_inplace_column_scale():
    """Check inplace_column_scale on CSR/CSC input against a dense reference.

    The same checks run for float64 and float32 data, and LIL input must be
    rejected with a TypeError.
    """
    def check(X, scale):
        csr = X.tocsr()
        csc = X.tocsc()
        expected = X.toarray()
        # Dense reference: broadcasting multiplies each column by its factor.
        expected *= scale
        inplace_column_scale(csc, scale)
        inplace_column_scale(csr, scale)
        assert_array_almost_equal(csr.toarray(), csc.toarray())
        assert_array_almost_equal(expected, csc.toarray())
        assert_array_almost_equal(expected, csr.toarray())
        assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)

    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    scale = rng.rand(200)

    check(X, scale)
    check(X.astype(np.float32), scale.astype(np.float32))
def test_inplace_column_scale():
    """Check inplace_column_scale on CSR/CSC input against a dense reference.

    LIL input must be rejected with a TypeError.
    """
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    csr, csc = X.tocsr(), X.tocsc()
    expected = X.toarray()
    scale = rng.rand(200)

    # Dense reference: broadcasting multiplies each column by its factor.
    expected *= scale
    for matrix in (csc, csr):
        inplace_column_scale(matrix, scale)

    assert_array_almost_equal(csr.toarray(), csc.toarray())
    assert_array_almost_equal(expected, csc.toarray())
    assert_array_almost_equal(expected, csr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
def _scale(X, zero_center=True):
    """Scale the columns of ``X`` in place by their standard deviation.

    Parameters
    ----------
    X
        Dense array or sparse matrix, modified in place.
    zero_center
        Subtract the column means before scaling. Not supported for sparse
        input.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``zero_center`` is true.
    """
    # - using sklearn.StandardScaler throws an error related to
    #   int to long trafo for very large matrices
    # - using X.multiply is slower
    # (the unreachable StandardScaler variant that used to sit behind
    # `if True: ... else:` has been removed)
    mean, var = _get_mean_var(X)
    scale = np.sqrt(var)
    if issparse(X):
        if zero_center:
            raise ValueError('Cannot zero-center sparse matrix.')
        sparsefuncs.inplace_column_scale(X, 1/scale)
    else:
        # Honor zero_center for dense input too; previously the mean was
        # subtracted unconditionally.
        if zero_center:
            X -= mean
        X /= scale
def compute_sseq_params(x, zeta_quantile=SSEQ_ZETA_QUANTILE):
    """ Compute global parameters for the sSeq differential expression method.

    The key parameters are the shrunken feature-wise dispersions.

    This method was published in:
    Yu D, et al. (2013) Shrinkage estimation of dispersion in Negative Binomial models for RNA-seq experiments with small sample size.
    Bioinformatics. 29: 1275-1282. doi: 10.1093/bioinformatics/btt143

    Args:
      x - Sparse matrix (csc) of counts (feature x cell)
      zeta_quantile (float) - Quantile of method-of-moments dispersion estimates to
                              use as the shrinkage target zeta.
    Returns:
      A dictionary containing the sSeq parameters and some diagnostic info.
    """
    # Rows are features (G), columns are cells (N)
    G, N = x.shape

    # Estimate size factors and normalize the matrix for quick mean/var calcs
    size_factors = estimate_size_factors(x)
    inv_size_factors = 1.0 / size_factors
    inv_size_factor_sum = np.sum(inv_size_factors)

    # Cast to float to prevent truncation of 1 -> 0 for size factors < 1
    x_norm = scipy.sparse.csc_matrix(x, dtype=np.float64, copy=True)
    sparsefuncs.inplace_column_scale(x_norm, inv_size_factors)

    # Estimate featurewise mean, variance, and dispersion by the method of moments
    # assuming that each feature follows a negative-binomial distribution.
    mean_g = np.squeeze(np.asarray(x_norm.mean(axis=1, dtype=np.float64)))
    # V[X] = E[X^2] - E[X]^2
    squared = x_norm.multiply(x_norm)
    mean_sq_g = np.squeeze(np.asarray(squared.mean(axis=1, dtype=np.float64)))
    var_g = mean_sq_g - np.square(mean_g)

    # Method of moments estimate of feature-wise dispersion (phi)
    # Only use features with non-zero variance in the following estimation
    use_g = var_g > 0
    mean_used = mean_g[use_g]
    numerator = float(N) * var_g[use_g] - mean_used * inv_size_factor_sum
    denominator = np.square(mean_used) * inv_size_factor_sum
    phi_mm_g = np.zeros(G)
    phi_mm_g[use_g] = np.maximum(0, numerator / denominator)
    phi_used = phi_mm_g[use_g]

    # Estimate the optimal global target dispersion (zeta_hat).
    # The true optimal zeta is that which minimizes the MSE vs the true dispersions.
    # The featurewise dispersions will be "shrunk" towards our estimate of zeta.

    # Use a high quantile of the MoM dispersion as our shrinkage target
    # per the rule of thumb in Yu, et al.
    zeta_hat = np.nanpercentile(phi_used, 100.0 * zeta_quantile)

    # Compute delta, the optimal shrinkage towards zeta_hat
    # This defines a linear function that shrinks the MoM dispersion estimates
    mean_phi_mm_g = np.mean(phi_used)
    spread_about_mean = np.sum(np.square(phi_used - mean_phi_mm_g)) / float(G - 1)
    spread_about_zeta = np.sum(np.square(phi_used - zeta_hat)) / float(G - 2)
    delta = spread_about_mean / spread_about_zeta

    # Compute the shrunken dispersion estimates
    # Interpolate between the MoM estimates and zeta_hat by delta
    phi_g = np.full(G, np.nan)
    if np.any(phi_used > 0):
        phi_g[use_g] = (1 - delta) * phi_used + delta * zeta_hat
    else:
        phi_g[use_g] = 0.0

    return {
        'N': N,
        'G': G,
        'size_factors': size_factors,
        'mean_g': mean_g,
        'var_g': var_g,
        'use_g': use_g,
        'phi_mm_g': phi_mm_g,
        'eval_zeta': None,
        'eval_asd': None,
        'asd_slope': None,
        'zeta_hat': zeta_hat,
        'delta': delta,
        'phi_g': phi_g,
    }
def run_pca(matrix,
            pca_features=None,
            pca_bcs=None,
            n_pca_components=None,
            random_state=None,
            min_count_threshold=0):
    """ Run a PCA on the matrix using the IRLBA matrix factorization algorithm.

    Prior to the PCA analysis, the matrix is modified so that all barcodes/columns have the same counts, and then
    the counts are transformed by a log2(1+X) operation.

    If desired, only a subset of features (e.g. sample rows) can be selected for PCA analysis. Each feature is ranked
    by its dispersion relative to other features that have a similar mean count. The top `pca_features` as ranked by
    this method will then be used for the PCA.

    One can also select to subset number of barcodes to use (e.g. sample columns), but in this case they are simply
    randomly sampled.

    Args:
        matrix (CountMatrix): The matrix to perform PCA on.
        pca_features (int): Number of features to subset from matrix and use in PCA. The top pca_features ranked by
                            dispersion are used
        pca_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_pca_components (int): How many PCA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PCA
                                   (this filter is prior to any subsetting that occurs).

    Returns:
        A PCA object

    Raises:
        MatrixRankTooSmallException: if fewer features/barcodes than components remain and
                                     min_count_threshold equals DEFAULT_RUNPCA_THRESHOLD.
    """
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    # NOTE(review): the global NumPy RNG (used for barcode subsampling below) is always
    # seeded with 0; `random_state` is only passed on to irlb.
    np.random.seed(0)

    # Threshold the rows/columns of matrix, will throw error if an empty matrix results.
    thresholded_matrix, _, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for PCA
    pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    if pca_bcs is None:
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    elif pca_bcs < thresholded_matrix.bcs_dim:
        pca_bc_indices = np.sort(
            np.random.choice(np.arange(thresholded_matrix.bcs_dim),
                             size=pca_bcs,
                             replace=False))
    elif pca_bcs > thresholded_matrix.bcs_dim:
        msg = (
            "You requested {} barcodes but the matrix after thresholding only "
            "included {}, so the smaller amount is being used.").format(
                pca_bcs, thresholded_matrix.bcs_dim)
        print(msg)
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if pca_features is None:
        pca_features = thresholded_matrix.features_dim
    elif pca_features > thresholded_matrix.features_dim:
        msg = (
            "You requested {} features but the matrix after thresholding only included {} features,"
            "so the smaller amount is being used.").format(
                pca_features, thresholded_matrix.features_dim)
        print(msg)
        pca_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    # Indices of the `pca_features` largest dispersion values (argsort is ascending).
    pca_feature_indices = np.argsort(dispersion)[-pca_features:]

    # Now determine how many components.
    if n_pca_components is None:
        n_pca_components = analysis_constants.PCA_N_COMPONENTS_DEFAULT

    likely_matrix_rank = min(pca_features, pca_bcs)
    if likely_matrix_rank < n_pca_components:
        if min_count_threshold == DEFAULT_RUNPCA_THRESHOLD:
            # Kick back to run_pca stage so it can retry with no threshold, this is for historical reasons
            raise MatrixRankTooSmallException()
        else:
            print((
                "There are fewer nonzero features or barcodes ({}) than requested "
                "PCA components ({}); reducing the number of components."
            ).format(likely_matrix_rank, n_pca_components))
            n_pca_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_pca_components):
        print(
            "Requested number of PCA components is large relative to the matrix size, an exact approach to matrix factorization may be faster."
        )

    # Note, after subsetting it is possible some rows/cols in pca_mat have counts below the threshold.
    # However, we are not performing a second thresholding as in practice subsetting is not used and we explain
    # that thresholding occurs prior to subsetting in the doc string.
    pca_mat = thresholded_matrix.select_barcodes(
        pca_bc_indices).select_features(pca_feature_indices)
    (pca_norm_mat, pca_center, pca_scale) = normalize_and_transpose(pca_mat)
    (u, d, v, _, _) = irlb(pca_norm_mat,
                           n_pca_components,
                           center=pca_center.squeeze(),
                           scale=pca_scale.squeeze(),
                           random_state=random_state)

    # make sure to project the matrix before centering, to avoid densification
    (full_norm_mat, full_center, full_scale) = normalize_and_transpose(matrix)
    sparsefuncs.inplace_column_scale(
        full_norm_mat,
        1 / full_scale.squeeze())  # can have some zeros here
    # Get a coordinate map so we know which columns in the old matrix correspond to columns in the new
    org_cols_used = get_original_columns_used(thresholded_features,
                                              pca_feature_indices)
    # Project the scaled full matrix onto v, then subtract the projection of the scaled
    # centers, so the centered matrix is never materialized.
    transformed_irlba_matrix = full_norm_mat[:, org_cols_used].dot(v) - (
        full_center / full_scale)[:, org_cols_used].dot(v)
    # Components are stored in full-matrix coordinates; unused features stay zero.
    irlba_components = np.zeros((n_pca_components, matrix.features_dim))
    irlba_components[:, org_cols_used] = v.T

    # calc proportion of variance explained
    variance_sum = len(
        pca_feature_indices
    )  # each feature has variance=1, mean=0 after normalization
    variance_explained = np.square(d) / (
        (len(pca_bc_indices) - 1) * variance_sum)
    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # Now project back up the dispersion to return.
    full_dispersion = np.empty(matrix.features_dim)
    full_dispersion[:] = np.nan
    full_dispersion[thresholded_features] = dispersion

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_pca_components)
    assert irlba_components.shape == (n_pca_components, matrix.features_dim)
    assert variance_explained.shape == (n_pca_components, )

    return PCA(transformed_irlba_matrix, irlba_components, variance_explained,
               full_dispersion, features_selected)
def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False):
    """
    Centers data to have mean zero along axis 0. If fit_intercept=False or if
    the X is a sparse matrix, no centering is done, but normalization can still
    be applied. The function returns the statistics necessary to reconstruct
    the input data, which are X_offset, y_offset, X_scale, such that the output

        X = (X - X_offset) / X_scale

    X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
    then the weighted mean of X and y is zero, and not the mean itself. If
    return_mean=True, the mean, eventually weighted, is returned, independently
    of whether X was centered (option used for optimization with sparse data in
    coordinate_descend).

    This is here because nearly all linear models will want their data to be
    centered. This function also systematically makes y consistent with X.dtype
    """
    # A scalar sample weight is treated the same as no weights at all.
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)
    y = np.asarray(y, dtype=X.dtype)

    if not fit_intercept:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        X_scale = np.ones(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_scale

    if sp.issparse(X):
        X_offset, X_var = mean_variance_axis(X, axis=0)
        if not return_mean:
            X_offset[:] = X.dtype.type(0)

        if not normalize:
            X_scale = np.ones(X.shape[1], dtype=X.dtype)
        else:
            # TODO: f_normalize could be used here as well but the function
            # inplace_csr_row_normalize_l2 must be changed such that it
            # can return also the norms computed internally

            # transform variance to norm in-place
            X_var *= X.shape[0]
            X_scale = np.sqrt(X_var, X_var)
            del X_var
            X_scale[X_scale == 0] = 1
            inplace_column_scale(X, 1. / X_scale)
    else:
        X_offset = np.average(X, axis=0, weights=sample_weight)
        X -= X_offset
        if not normalize:
            X_scale = np.ones(X.shape[1], dtype=X.dtype)
        else:
            X, X_scale = f_normalize(X, axis=0, copy=False,
                                     return_norm=True)

    y_offset = np.average(y, axis=0, weights=sample_weight)
    y = y - y_offset
    return X, y, X_offset, y_offset, X_scale
def __apply_idf__(icm):
    """Scale the columns of ``icm`` in place by the weighting named by ``idf_type``.

    Reads ``idf_type``, ``N``, ``popularity`` and (for ``'max'``) ``nt`` from
    the enclosing scope. ``'none'`` and ``''`` leave ``icm`` untouched.

    Raises
    ------
    AttributeError
        If ``idf_type`` is not one of the supported names.
    """
    if idf_type == 'none' or idf_type == '':
        pass
    elif idf_type == 'idf':
        skfun.inplace_column_scale(icm, np.log10(N / popularity))
    elif idf_type == 'idfshrinked':
        # Parenthesized: `N / 1+popularity` evaluated as `(N / 1) + popularity`.
        skfun.inplace_column_scale(icm, np.log10(N / (1 + popularity)))
    elif idf_type == 'smooth':
        skfun.inplace_column_scale(icm, np.log10(1 + (N / popularity)))
    elif idf_type == 'max':
        nt_max = np.max(nt)
        skfun.inplace_column_scale(icm, np.log10(nt_max / (1 + popularity)))
    elif idf_type == 'square':
        skfun.inplace_column_scale(icm, sqrt(N / popularity))
    elif idf_type == 'squaresmooth':
        skfun.inplace_column_scale(icm, sqrt(1 + N / popularity))
    elif idf_type == 'prob':
        skfun.inplace_column_scale(icm, np.log10((N - popularity) / popularity))
    else:
        raise AttributeError("n***a wut? idf ["+idf_type+"] not found")
def mapper(X):
    """Divide each column of ``X`` in place by ``self.scale_`` and return ``X``."""
    inverse_scale = 1 / self.scale_
    inplace_column_scale(X, inverse_scale)
    return X
def scale(self, X, ddof=True):
    """Standardize a dataset along ``self.axis``

    Center to the mean and component wise scale to unit variance, as
    configured by the instance attributes ``with_mean``, ``with_std``,
    ``axis`` and ``copy``.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The data to center and scale.

    ddof : bool, True by default
        For dense input, if truthy the standard deviation is computed with
        ``ddof=1``; otherwise with NumPy's default ``ddof=0``.
        Sparse input does not consult this flag.

    Returns
    -------
    X : the validated data after centering/scaling.

    Raises
    ------
    ValueError
        If ``X`` is sparse and ``self.with_mean`` is truthy, or if ``X`` is
        sparse and ``self.axis`` is not 0.

    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.

    Instead the caller is expected to either set explicitly
    `with_mean=False` (in that case, only variance scaling will be
    performed on the features of the CSC matrix) or to call `X.toarray()`
    if he/she expects the materialized dense array to fit in memory.

    To avoid memory copy the caller should pass a CSC matrix.

    NaNs are treated as missing values: disregarded to compute the statistics,
    and maintained during the data transformation.
    """  # noqa
    X = check_array(X, accept_sparse='csc', copy=self.copy, ensure_2d=False,
                    estimator='the scale function', dtype=FLOAT_DTYPES,
                    force_all_finite='allow-nan')
    if sparse.issparse(X):
        if self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` instead"
                " See docstring for motivation and alternatives.")
        if self.axis != 0:
            raise ValueError("Can only scale sparse matrix on axis=0, "
                             " got axis=%d" % self.axis)
        if self.with_std:
            _, var = mean_variance_axis(X, axis=0)
            var = _handle_zeros_in_scale(var, copy=False)
            inplace_column_scale(X, 1 / np.sqrt(var))
    else:
        X = np.asarray(X)
        if self.with_mean:
            mean_ = np.nanmean(X, self.axis)
        if self.with_std:
            # Both variants are NaN-aware (input validation allows NaN) and
            # use the configured axis; the previous code referenced an
            # undefined name `axis` and used the NaN-propagating np.std.
            if ddof:
                scale_ = np.nanstd(X, axis=self.axis, ddof=1)
            else:
                scale_ = np.nanstd(X, axis=self.axis)
        # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested in
        Xr = np.rollaxis(X, self.axis)
        if self.with_mean:
            Xr -= mean_
            mean_1 = np.nanmean(Xr, axis=0)
            # Verify that mean_1 is 'close to zero'. If X contains very
            # large values, mean_1 can also be very large, due to a lack of
            # precision of mean_. In this case, a pre-scaling of the
            # concerned feature is efficient, for instance by its mean or
            # maximum.
            if not np.allclose(mean_1, 0):
                Xr -= mean_1
        if self.with_std:
            scale_ = _handle_zeros_in_scale(scale_, copy=False)
            Xr /= scale_
            if self.with_mean:
                mean_2 = np.nanmean(Xr, axis=0)
                # If mean_2 is not 'close to zero', it comes from the fact that
                # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                # if mean_1 was close to zero. The problem is thus essentially
                # due to the lack of precision of mean_. A solution is then to
                # subtract the mean again:
                if not np.allclose(mean_2, 0):
                    warnings.warn("Numerical issues were encountered "
                                  "when scaling the data "
                                  "and might not be solved. The standard "
                                  "deviation of the data is probably "
                                  "very close to 0. ")
                    Xr -= mean_2
    return X
def run_pca(matrix, pca_genes=None, pca_bcs=None, n_pca_components=None, random_state=None):
    """Run a PCA on ``matrix`` via ``irlb`` and return a ``PCA`` object.

    Args:
        matrix: The count matrix to analyse (provides ``genes_dim``, ``bcs_dim``,
                ``genes`` and the ``select_*`` methods used below).
        pca_genes (int): Number of genes to keep, ranked by normalized dispersion.
                         Defaults to all genes.
        pca_bcs (int): Number of barcodes to sample at random. Defaults to all barcodes.
        n_pca_components (int): Number of components. Defaults to
                                ``cr_constants.PCA_N_COMPONENTS_DEFAULT``.
        random_state: Passed to ``irlb``. Defaults to ``cr_constants.RANDOM_STATE``.

    Returns:
        A PCA object built from the projected matrix, the components, the proportion
        of variance explained, the per-gene dispersion and the selected gene ids.
    """
    if pca_genes is None:
        pca_genes = matrix.genes_dim
    if pca_bcs is None:
        pca_bcs = matrix.bcs_dim
    if n_pca_components is None:
        n_pca_components = cr_constants.PCA_N_COMPONENTS_DEFAULT
    # The clamp applies to caller-supplied values as well as the default.
    if n_pca_components > pca_genes:
        print("There are fewer nonzero genes than PCA components; reducing the number of components.")
        n_pca_components = pca_genes

    if random_state is None:
        random_state = cr_constants.RANDOM_STATE

    # NOTE(review): the global NumPy RNG (used for barcode subsampling below) is always
    # seeded with 0; `random_state` is only passed on to irlb.
    np.random.seed(0)

    (full_norm_mat, full_center, full_scale) = normalize_and_transpose(matrix)

    # initialize PCA subsets
    pca_bc_indices = np.arange(matrix.bcs_dim)
    pca_gene_indices = np.arange(matrix.genes_dim)

    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = normalize_by_umi(matrix)
    (mu, var) = summarize_columns(m.T)
    dispersion = get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?

    # Indices of the `pca_genes` largest dispersion values (argsort is ascending).
    pca_gene_indices = np.argsort(dispersion)[-pca_genes:]

    if pca_bcs < matrix.bcs_dim:
        pca_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim), size=pca_bcs, replace=False))

    pca_mat, _, pca_genes_nonzero = matrix.select_barcodes(
        pca_bc_indices).select_genes(pca_gene_indices).select_nonzero_axes()
    pca_gene_nonzero_indices = pca_gene_indices[pca_genes_nonzero]

    if pca_mat.genes_dim < 2 or pca_mat.bcs_dim < 2:
        print("Matrix is too small for further downsampling - num_pca_bcs and num_pca_genes will be ignored.")
        pca_mat, _, pca_genes_nonzero = matrix.select_nonzero_axes()
        pca_gene_nonzero_indices = pca_genes_nonzero

    (pca_norm_mat, pca_center, pca_scale) = normalize_and_transpose(pca_mat)

    (u, d, v, _, _) = irlb(pca_norm_mat, n_pca_components,
                           center=pca_center.squeeze(),
                           scale=pca_scale.squeeze(),
                           random_state=random_state)

    # make sure to project the matrix before centering, to avoid densification
    sparsefuncs.inplace_column_scale(full_norm_mat, 1 / full_scale.squeeze())
    transformed_irlba_matrix = full_norm_mat[:, pca_gene_nonzero_indices].dot(
        v) - (full_center / full_scale)[:, pca_gene_nonzero_indices].dot(v)
    # Components are stored in full-matrix coordinates; unused genes stay zero.
    irlba_components = np.zeros((n_pca_components, matrix.genes_dim))
    irlba_components[:, pca_gene_nonzero_indices] = v.T

    # calc proportion of variance explained
    variance_sum = len(
        pca_gene_indices
    )  # each gene has variance=1, mean=0 after normalization
    variance_explained = np.square(d) / (
        (len(pca_bc_indices) - 1) * variance_sum)

    genes_selected = np.array([gene.id for gene in matrix.genes
                               ])[pca_gene_nonzero_indices]

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_pca_components)
    assert irlba_components.shape == (n_pca_components, matrix.genes_dim)
    assert variance_explained.shape == (n_pca_components, )

    return PCA(transformed_irlba_matrix, irlba_components, variance_explained,
               dispersion, genes_selected)
def normalize_matrix(matr, scale):
    """Return a float64 copy of ``matr`` with column ``j`` multiplied by
    ``median(scale) / scale[j]``; ``matr`` itself is left untouched."""
    factors = np.median(scale) / scale
    normalized = matr.copy().astype(np.float64)
    sparsefuncs.inplace_column_scale(normalized, factors)
    return normalized
def scaling(pat_mat):
    """Divide each column of ``pat_mat`` in place by the square root of its
    variance (as reported by ``mean_variance_axis``). Returns ``None``.

    Zero-variance columns are not guarded, so their factor is ``1 / 0``.
    """
    _, variances = mean_variance_axis(pat_mat, axis=0)
    inverse_std = 1 / np.sqrt(variances)
    inplace_column_scale(pat_mat, inverse_std)