Example #1
    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'scale_')

        copy = copy if copy is not None else self.copy
        if not sparse.issparse(X):
            return super(SparseScaler, self).inverse_transform(X, copy=copy)

        if self.with_mean:
            raise ValueError(
                "Cannot uncenter sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        if not sparse.isspmatrix_csr(X):
            X = X.tocsr()
            copy = False
        if copy:
            X = X.copy()

        # The mean cannot be added back to a sparse matrix (guarded above),
        # so only the per-column scaling is reverted.
        if self.scale_ is not None:
            inplace_column_scale(X, self.scale_)
        return X
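
Since uncentering is refused for sparse input, the sparse path of inverse_transform only undoes the per-column scaling. A minimal self-contained sketch of that round trip with sklearn's inplace_column_scale (the scale_ values here are invented):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import inplace_column_scale

rng = np.random.RandomState(0)
X = sp.random(5, 3, density=0.6, format='csr', random_state=rng)
scale_ = np.array([2.0, 0.5, 4.0])       # hypothetical fitted per-feature scales

X_before = X.toarray().copy()
inplace_column_scale(X, 1.0 / scale_)    # transform: divide each column by its scale
inplace_column_scale(X, scale_)          # inverse_transform: multiply it back
assert np.allclose(X.toarray(), X_before)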
Example #2
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'std_')

        copy = copy if copy is not None else self.copy
        X = check_array(X, copy=copy, accept_sparse="csc", ensure_2d=False)
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float64)
        if sparse.issparse(X):
            if self.center_sparse:
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]

            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")

            else:
                pass

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
def test_inplace_row_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(100)
    XA *= scale.reshape(-1, 1)

    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_column_scale(X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale.reshape(-1, 1)
    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_column_scale(X.tolil(), scale)
Example #4
def scale_array(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    if copy:
        X = X.copy()
    if not zero_center and max_value is not None:
        logg.info(  # Be careful of what? This should be more specific
            "... be careful when using `max_value` "
            "without `zero_center`.")

    mean, var = _get_mean_var(X)
    std = np.sqrt(var)
    std[std == 0] = 1e-12
    if issparse(X):
        if zero_center:
            raise ValueError("Cannot zero-center sparse matrix.")
        sparsefuncs.inplace_column_scale(X, 1 / std)
    else:
        if zero_center:
            X -= mean
        X /= std

    # do the clipping
    if max_value is not None:
        logg.debug(f"... clipping at max_value {max_value}")
        X[X > max_value] = max_value

    if return_mean_std:
        return X, mean, std
    else:
        return X
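
For reference, a dense-only sketch of the zero-centering branch above (not the library function itself), showing why the zero-variance clamp matters for constant columns:

import numpy as np

X = np.array([[1.0, 2.0, 3.0],
              [1.0, 4.0, 9.0],
              [1.0, 6.0, 27.0]])
mean = X.mean(axis=0)
std = X.std(axis=0)
std[std == 0] = 1e-12        # the constant first column would otherwise divide by zero
X_scaled = (X - mean) / std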
Example #5
    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        check_is_fitted(self, 'std_')

        copy = copy if copy is not None else self.copy
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
                    "instead See docstring for motivation and alternatives.")
            if not sparse.isspmatrix_csr(X):
                X = X.tocsr()
                copy = False
            if copy:
                X = X.copy()
            if self.std_ is not None:
                inplace_column_scale(X, self.std_)
        else:
            X = np.asarray(X)
            if copy:
                X = X.copy()
            if self.with_std:
                X *= self.std_
            if self.with_mean:
                X += self.mean_
        return X
Example #6
    def transform(self, X, y=None, copy=None):
        """Perform standardization by centering and scaling

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        copy = copy if copy is not None else self.copy
        X = check_arrays(X, copy=copy, sparse_format="csc")[0]
        if warn_if_not_float(X, estimator=self):
            X = X.astype(np.float64)
        if sparse.issparse(X):
            if self.center_sparse:
                for i in range(X.shape[1]):
                    X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                pass

            if self.std_ is not None:
                inplace_column_scale(X, 1 / self.std_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.std_
        return X
    def transform(self, X, y=None, copy=None):
        """
        Perform standardization by centering and scaling using the parameters.

        :param X: Data matrix to scale.
        :type X: numpy.ndarray, shape [n_samples, n_features]
        :param y: Passthrough for scikit-learn ``Pipeline`` compatibility.
        :type y: None
        :param bool copy: Copy the X matrix.
        :return: Scaled version of the X data matrix.
        :rtype: numpy.ndarray, shape [n_samples, n_features]
        """
        check_is_fitted(self, 'scale_')

        copy = copy if copy is not None else self.copy

        X = check_array(X,
                        accept_sparse='csr',
                        copy=copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES)

        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            if self.scale_ is not None:
                inplace_column_scale(X, 1 / self.scale_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.scale_
        return X
    def inverse_transform(self, X, copy=None):
        """
        Scale back the data to the original representation.

        :param X: Scaled data matrix.
        :type X: numpy.ndarray, shape [n_samples, n_features]
        :param bool copy: Copy the X data matrix.
        :return: X data matrix with the scaling operation reverted.
        :rtype: numpy.ndarray, shape [n_samples, n_features]
        """
        check_is_fitted(self, 'scale_')

        copy = copy if copy is not None else self.copy
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
                    "instead See docstring for motivation and alternatives.")
            if not sparse.isspmatrix_csr(X):
                X = X.tocsr()
                copy = False
            if copy:
                X = X.copy()
            if self.scale_ is not None:
                inplace_column_scale(X, self.scale_)
        else:
            X = numpy.asarray(X)
            if copy:
                X = X.copy()
            if self.with_std:
                X *= self.scale_
            if self.with_mean:
                X += self.mean_

        return X
Example #9
    def inverse_transform(self, X):
        """Scale back the data to the original representation
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The rescaled data to be transformed back.
        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        copy=self.copy,
                        estimator=self,
                        dtype=FLOAT_DTYPES,
                        force_all_finite='allow-nan')

        if sparse.issparse(X):
            if self.with_scaling:
                inplace_column_scale(X, self.scale_)
        else:
            if self.with_scaling:
                X *= self.scale_
            if self.with_centering:
                X += self.center_
        return X
Example #10
    def transform(self, X):
        """Center and scale the data.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the specified axis.
        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = self._validate_data(X,
                                accept_sparse=('csr', 'csc'),
                                copy=self.copy,
                                estimator=self,
                                dtype=FLOAT_DTYPES,
                                reset=False,
                                force_all_finite='allow-nan')

        if sparse.issparse(X):
            if self.with_scaling:
                inplace_column_scale(X, 1.0 / self.scale_)
        else:
            if self.with_centering:
                X -= self.center_
            if self.with_scaling:
                X /= self.scale_
        return X
Example #11
def sparse_center_data(X, y, fit_intercept, normalize=False):
    """
    Compute information needed to center data to have mean zero along
    axis 0. Be aware that X will not be centered since it would break
    the sparsity, but will be normalized if asked so.
    """
    if fit_intercept:
        # we might require not to change the csr matrix sometimes
        # store a copy if normalize is True.
        # Change dtype to float64 since mean_variance_axis accepts
        # it that way.
        if sp.isspmatrix(X) and X.getformat() == 'csr':
            X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
        else:
            X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)

        X_offset, X_var = mean_variance_axis(X, axis=0)
        if normalize:
            # transform variance to std in-place
            X_var *= X.shape[0]
            X_std = np.sqrt(X_var, X_var)
            del X_var
            X_std[X_std == 0] = 1
            inplace_column_scale(X, 1. / X_std)
        else:
            X_std = np.ones(X.shape[1])
        y_offset = y.mean(axis=0)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_std = np.ones(X.shape[1])
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_std
Example #12
    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        copy = copy if copy is not None else self.copy
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
                    "instead See docstring for motivation and alternatives.")
            if not sparse.isspmatrix_csr(X):
                X = X.tocsr()
                copy = False
            if copy:
                X = X.copy()
            if self.std_ is not None:
                inplace_column_scale(X, self.std_)
        else:
            X = np.asarray(X)
            if copy:
                X = X.copy()
            if self.with_std:
                X *= self.std_
            if self.with_mean:
                X += self.mean_
        return X
Example #13
def normalize_by_umi(matrix):
    reads_per_bc = matrix.get_reads_per_bc()
    median_reads_per_bc = np.median(reads_per_bc)
    scaling_factors = median_reads_per_bc / reads_per_bc

    # Normalize each barcode's total count by median total count
    m = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_column_scale(m, scaling_factors)

    return m
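
A self-contained sketch of the same median normalization on a toy feature-by-barcode matrix; plain column sums stand in for the project's get_reads_per_bc helper:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

counts = sp.csc_matrix(np.array([[0., 2., 4.],
                                 [1., 0., 8.],
                                 [3., 2., 0.]]))
reads_per_bc = np.asarray(counts.sum(axis=0)).ravel()     # total counts per barcode (column)
scaling_factors = np.median(reads_per_bc) / reads_per_bc
sparsefuncs.inplace_column_scale(counts, scaling_factors)
# every column now sums to the median of the original per-barcode totals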
Example #14
def _scale_precomputed(X, column_means, column_vars, zero_center=True):
    scale = np.sqrt(column_vars)
    if zero_center:
        X -= column_means
        scale[scale == 0] = 1e-12
        X /= scale
    else:
        if issparse(X):
            sparsefuncs.inplace_column_scale(X, 1 / scale)
        else:
            X /= scale
def test_inplace_column_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(200)
    XA *= scale

    inplace_column_scale(Xc, scale)
    inplace_column_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale
    inplace_column_scale(Xc, scale)
    inplace_column_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
Example #17
def _scale(X, zero_center=True):
    # - using sklearn.StandardScaler throws an error related to
    #   int to long trafo for very large matrices
    # - using X.multiply is slower
    #   the result differs very slightly, why?
    if True:
        mean, var = _get_mean_var(X)
        scale = np.sqrt(var)
        if issparse(X):
            if zero_center: raise ValueError('Cannot zero-center sparse matrix.')
            sparsefuncs.inplace_column_scale(X, 1/scale)
        else:
            X -= mean
            X /= scale
    else:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler(with_mean=zero_center, copy=False).partial_fit(X)
        # user R convention (unbiased estimator)
        scaler.scale_ *= np.sqrt(X.shape[0]/(X.shape[0]-1))
        scaler.transform(X)
def compute_sseq_params(x, zeta_quantile=SSEQ_ZETA_QUANTILE):
    """ Compute global parameters for the sSeq differential expression method.
    The key parameters are the shrunken feature-wise dispersions.

    This method was published in:
    Yu D, et al. (2013) Shrinkage estimation of dispersion in Negative Binomial models for RNA-seq experiments with small sample size.
    Bioinformatics. 29: 1275-1282. doi: 10.1093/bioinformatics/btt143
    Args:
      x - Sparse matrix (csc) of counts (feature x cell)
      zeta_quantile (float) - Quantile of method-of-moments dispersion estimates to
                              use as the shrinkage target zeta.
    Returns:
      A dictionary containing the sSeq parameters and some diagnostic info.
    """
    # Number of cells
    N = x.shape[1]

    # Number of features
    G = x.shape[0]

    # Estimate size factors and normalize the matrix for quick mean/var calcs
    size_factors = estimate_size_factors(x)
    # Cast to float to prevent truncation of 1 -> 0 for size factors < 1
    x_norm = scipy.sparse.csc_matrix(x, dtype=np.float64, copy=True)
    sparsefuncs.inplace_column_scale(x_norm, 1.0 / size_factors)

    # Estimate featurewise mean, variance, and dispersion by the method of moments
    # assuming that each feature follows a negative-binomial distribution.
    mean_g = np.squeeze(np.asarray(x_norm.mean(axis=1, dtype=np.float64)))
    # V[X] = E[X^2] - E[X]^2
    mean_sq_g = np.squeeze(np.asarray(x_norm.multiply(x_norm).mean(axis=1, dtype=np.float64)))
    var_g = mean_sq_g - np.square(mean_g)

    # Method of moments estimate of feature-wise dispersion (phi)
    # Only use features with non-zero variance in the following estimation
    use_g = var_g > 0
    phi_mm_g = np.zeros(G)
    phi_mm_g[use_g] = np.maximum(0, (float(N) * var_g[use_g] - mean_g[use_g] * np.sum(1.0 / size_factors)) /
                                 (np.square(mean_g[use_g]) * np.sum(1.0 / size_factors)))

    # Estimate the optimal global target dispersion (zeta_hat).
    # The true optimal zeta is that which minimizes the MSE vs the true dispersions.
    # The featurewise dispersions will be "shrunk" towards our estimate of zeta.

    # Use a high quantile of the MoM dispersion as our shrinkage target
    # per the rule of thumb in Yu, et al.
    zeta_hat = np.nanpercentile(phi_mm_g[use_g], 100.0 * zeta_quantile)

    # Compute delta, the optimal shrinkage towards zeta_hat
    # This defines a linear function that shrinks the MoM dispersion estimates
    mean_phi_mm_g = np.mean(phi_mm_g[use_g])
    delta = (np.sum(np.square(phi_mm_g[use_g] - mean_phi_mm_g)) / float(G - 1)) / \
            (np.sum(np.square(phi_mm_g[use_g] - zeta_hat)) / float(G - 2))

    # Compute the shrunken dispersion estimates
    # Interpolate between the MoM estimates and zeta_hat by delta
    phi_g = np.full(G, np.nan)
    if np.any(phi_mm_g[use_g] > 0):
        phi_g[use_g] = (1 - delta) * phi_mm_g[use_g] + delta * zeta_hat
    else:
        phi_g[use_g] = 0.0

    return {
        'N': N,
        'G': G,
        'size_factors': size_factors,
        'mean_g': mean_g,
        'var_g': var_g,
        'use_g': use_g,
        'phi_mm_g': phi_mm_g,
        'eval_zeta': None,
        'eval_asd': None,
        'asd_slope': None,
        'zeta_hat': zeta_hat,
        'delta': delta,
        'phi_g': phi_g,
    }
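
A quick sanity sketch (not part of the source) of the method-of-moments step above: with all size factors equal to 1, sum(1.0 / size_factors) equals N and the estimator reduces to the familiar negative-binomial form (var - mean) / mean**2:

import numpy as np

rng = np.random.RandomState(0)
N = 100
counts = rng.negative_binomial(n=2, p=0.5, size=N).astype(np.float64)
mean, var = counts.mean(), counts.var()
phi_mm = max(0.0, (N * var - mean * N) / (mean ** 2 * N))   # size factors all equal to 1
assert np.isclose(phi_mm, max(0.0, (var - mean) / mean ** 2))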
Example #19
def run_pca(matrix,
            pca_features=None,
            pca_bcs=None,
            n_pca_components=None,
            random_state=None,
            min_count_threshold=0):
    """ Run a PCA on the matrix using the IRLBA matrix factorization algorithm.  Prior to the PCA analysis, the
    matrix is modified so that all barcodes/columns have the same counts, and then the counts are transformed
    by a log2(1+X) operation.

    If desired, only a subset of features (e.g. sample rows) can be selected for PCA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `pca_features` as ranked by
    this method will then be used for the PCA.

    One can also select to subset number of barcodes to use (e.g. sample columns), but in this case they are simply
    randomly sampled.

    Args:
        matrix (CountMatrix): The matrix to perform PCA on.
        pca_features (int): Number of features to subset from matrix and use in PCA. The top pca_features ranked by
                            dispersion are used
        pca_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_pca_components (int): How many PCA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PCA
                                   (this filter is prior to any subsetting that occurs).
    Returns:
        A PCA object
    """
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(0)

    # Threshold the rows/columns of matrix, will throw error if an empty matrix results.
    thresholded_matrix, _, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for PCA
    pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    if pca_bcs is None:
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)
    elif pca_bcs < thresholded_matrix.bcs_dim:
        pca_bc_indices = np.sort(
            np.random.choice(np.arange(thresholded_matrix.bcs_dim),
                             size=pca_bcs,
                             replace=False))
    elif pca_bcs > thresholded_matrix.bcs_dim:
        msg = (
            "You requested {} barcodes but the matrix after thresholding only "
            "included {}, so the smaller amount is being used.").format(
                pca_bcs, thresholded_matrix.bcs_dim)
        print(msg)
        pca_bcs = thresholded_matrix.bcs_dim
        pca_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if pca_features is None:
        pca_features = thresholded_matrix.features_dim
    elif pca_features > thresholded_matrix.features_dim:
        msg = (
            "You requested {} features but the matrix after thresholding only included {} features,"
            "so the smaller amount is being used.").format(
                pca_features, thresholded_matrix.features_dim)
        print(msg)
        pca_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    pca_feature_indices = np.argsort(dispersion)[-pca_features:]

    # Now determine how many components.
    if n_pca_components is None:
        n_pca_components = analysis_constants.PCA_N_COMPONENTS_DEFAULT
    likely_matrix_rank = min(pca_features, pca_bcs)
    if likely_matrix_rank < n_pca_components:
        if min_count_threshold == DEFAULT_RUNPCA_THRESHOLD:
            # Kick back to run_pca stage so it can retry with no threshold, this is for historical reasons
            raise MatrixRankTooSmallException()
        else:
            print((
                "There are fewer nonzero features or barcodes ({}) than requested "
                "PCA components ({}); reducing the number of components."
            ).format(likely_matrix_rank, n_pca_components))
            n_pca_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_pca_components):
        print(
            "Requested number of PCA components is large relative to the matrix size, an exact approach to matrix factorization may be faster."
        )

    # Note, after subsetting it is possible some rows/cols in pca_mat have counts below the threshold.
    # However, we are not performing a second thresholding as in practice subsetting is not used and we explain
    # that thresholding occurs prior to subsetting in the doc string.
    pca_mat = thresholded_matrix.select_barcodes(
        pca_bc_indices).select_features(pca_feature_indices)
    (pca_norm_mat, pca_center, pca_scale) = normalize_and_transpose(pca_mat)
    (u, d, v, _, _) = irlb(pca_norm_mat,
                           n_pca_components,
                           center=pca_center.squeeze(),
                           scale=pca_scale.squeeze(),
                           random_state=random_state)

    # make sure to project the matrix before centering, to avoid densification
    (full_norm_mat, full_center, full_scale) = normalize_and_transpose(matrix)
    sparsefuncs.inplace_column_scale(
        full_norm_mat, 1 / full_scale.squeeze())  # can have some zeros here
    # Get a coordinate map so we know which columns in the old matrix correspond to columns in the new
    org_cols_used = get_original_columns_used(thresholded_features,
                                              pca_feature_indices)
    transformed_irlba_matrix = full_norm_mat[:, org_cols_used].dot(v) - (
        full_center / full_scale)[:, org_cols_used].dot(v)
    irlba_components = np.zeros((n_pca_components, matrix.features_dim))
    irlba_components[:, org_cols_used] = v.T

    # calc proportion of variance explained
    variance_sum = len(
        pca_feature_indices
    )  # each feature has variance=1, mean=0 after normalization
    variance_explained = np.square(d) / (
        (len(pca_bc_indices) - 1) * variance_sum)
    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # Now project back up the dispersion to return.
    full_dispersion = np.empty(matrix.features_dim)
    full_dispersion[:] = np.nan
    full_dispersion[thresholded_features] = dispersion

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_pca_components)
    assert irlba_components.shape == (n_pca_components, matrix.features_dim)
    assert variance_explained.shape == (n_pca_components, )

    return PCA(transformed_irlba_matrix, irlba_components, variance_explained,
               full_dispersion, features_selected)
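
The "project before centering" step relies on distributing the projection, ((X - c) / s) V = (X / s) V - (c / s) V, so the sparse matrix is scaled and projected without ever being densified. A small sketch of that identity with invented values:

import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
X = sp.random(50, 20, density=0.1, format='csr', random_state=rng)
center = np.asarray(X.mean(axis=0))      # shape (1, 20)
scale = np.full((1, 20), 2.0)            # made-up per-column scale
V = rng.rand(20, 5)                      # made-up loadings

dense_way = ((X.toarray() - center) / scale).dot(V)
sparse_way = np.asarray(X.multiply(1.0 / scale).dot(V)) - (center / scale).dot(V)
assert np.allclose(dense_way, sparse_way)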
Example #20
def _preprocess_data(X,
                     y,
                     fit_intercept,
                     normalize=False,
                     copy=True,
                     sample_weight=None,
                     return_mean=False):
    """
    Centers data to have mean zero along axis 0. If fit_intercept=False or if
    X is a sparse matrix, no centering is done, but normalization can still
    be applied. The function returns the statistics necessary to reconstruct
    the input data, which are X_offset, y_offset, X_scale, such that the output
        X = (X - X_offset) / X_scale
    X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
    then the weighted mean of X and y is zero, and not the mean itself. If
    return_mean=True, the mean (possibly weighted) is returned, independently
    of whether X was centered (an option used for optimization with sparse
    data in coordinate descent).
    This is here because nearly all linear models will want their data to be
    centered. This function also systematically makes y consistent with X.dtype.

    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X,
                    copy=copy,
                    accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)
    y = np.asarray(y, dtype=X.dtype)

    if fit_intercept:
        if sp.issparse(X):
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset[:] = X.dtype.type(0)

            if normalize:

                # TODO: f_normalize could be used here as well but the function
                # inplace_csr_row_normalize_l2 must be changed such that it
                # can return also the norms computed internally

                # transform variance to norm in-place
                X_var *= X.shape[0]
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)

        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X,
                                         axis=0,
                                         copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        X_scale = np.ones(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale
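
A numpy-only sketch (not the private helper itself) of the documented relationship: X_scale is the L2 norm of the centered columns, so the preprocessed columns end up with unit norm:

import numpy as np

X = np.array([[1.0, 10.0],
              [3.0, 30.0],
              [5.0, 50.0]])
X_offset = X.mean(axis=0)
X_centered = X - X_offset
X_scale = np.sqrt((X_centered ** 2).sum(axis=0))   # L2 norm of X - X_offset
X_scale[X_scale == 0] = 1
X_out = X_centered / X_scale
assert np.allclose(np.linalg.norm(X_out, axis=0), 1.0)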
Example #21
 def __apply_idf__(icm):
     if idf_type == 'none' or idf_type=='':
         pass
     elif idf_type == 'idf':
         skfun.inplace_column_scale(icm, np.log10(N/popularity))
     elif idf_type == 'idfshrinked':
         skfun.inplace_column_scale(icm, np.log10(N / (1 + popularity)))
     elif idf_type == 'smooth':
         skfun.inplace_column_scale(icm, np.log10(1 + (N / popularity)))
     elif idf_type == 'max':
         nt_max = np.max(nt)
         skfun.inplace_column_scale(icm, np.log10(nt_max / (1 + popularity)))
     elif idf_type == 'square':
         skfun.inplace_column_scale(icm, np.sqrt(N / popularity))
     elif idf_type == 'squaresmooth':
         skfun.inplace_column_scale(icm, np.sqrt(1 + N / popularity))
     elif idf_type == 'prob':
         skfun.inplace_column_scale(icm, np.log10((N - popularity) / popularity))
     else:
         raise AttributeError("idf type [" + idf_type + "] not found")
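
A toy illustration of the 'smooth' variant above on an invented user-item matrix; icm columns are items and popularity their column sums, with the surrounding class state omitted:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs as skfun

icm = sp.csr_matrix(np.array([[1., 1., 0.],
                              [1., 0., 0.],
                              [1., 1., 1.]]))
N = icm.shape[0]                                      # number of users
popularity = np.asarray(icm.sum(axis=0)).ravel()      # users per item
skfun.inplace_column_scale(icm, np.log10(1 + N / popularity))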
Example #22
 def mapper(X):
     inplace_column_scale(X, 1 / self.scale_)
     return X
Example #23
    def scale(self, X, ddof=True):
        
        """Standardize a dataset along any axis

        Center to the mean and component wise scale to unit variance.

        Read more in the :ref:`User Guide <preprocessing_scaler>`.

        Parameters
        ----------
        X : {array-like, sparse matrix}
            The data to center and scale.

        axis : int (0 by default)
            axis used to compute the means and standard deviations along. If 0,
            independently standardize each feature, otherwise (if 1) standardize
            each sample.

        with_mean : boolean, True by default
            If True, center the data before scaling.

        with_std : boolean, True by default
            If True, scale the data to unit variance (or equivalently,
            unit standard deviation).

        copy : boolean, optional, default True
            set to False to perform inplace row normalization and avoid a
            copy (if the input is already a numpy array or a scipy.sparse
            CSC matrix and if axis is 1).

        Notes
        -----
        This implementation will refuse to center scipy.sparse matrices
        since it would make them non-sparse and would potentially crash the
        program with memory exhaustion problems.

        Instead the caller is expected to either set explicitly
        `with_mean=False` (in that case, only variance scaling will be
        performed on the features of the CSC matrix) or to call `X.toarray()`
        if he/she expects the materialized dense array to fit in memory.

        To avoid memory copy the caller should pass a CSC matrix.

        NaNs are treated as missing values: disregarded to compute the statistics,
        and maintained during the data transformation.

        We use a biased estimator for the standard deviation, equivalent to
        `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
        affect model performance.

        For a comparison of the different scalers, transformers, and normalizers,
        see :ref:`examples/preprocessing/plot_all_scaling.py
        <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.

        See also
        --------
        StandardScaler: Performs scaling to unit variance using the ``Transformer`` API
            (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

        """  # noqa
        X = check_array(X, accept_sparse='csc', copy=self.copy, ensure_2d=False,
                        estimator='the scale function', dtype=FLOAT_DTYPES,
                        force_all_finite='allow-nan')
        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` instead"
                    " See docstring for motivation and alternatives.")
            if self.axis != 0:
                raise ValueError("Can only scale sparse matrix on axis=0, "
                                " got axis=%d" % self.axis)
            if self.with_std:
                _, var = mean_variance_axis(X, axis=0)
                var = _handle_zeros_in_scale(var, copy=False)
                inplace_column_scale(X, 1 / np.sqrt(var))
        else:
            X = np.asarray(X)
            if self.with_mean:
                mean_ = np.nanmean(X, self.axis)
            if self.with_std:
                if ddof:
                    scale_ = np.nanstd(X, axis=self.axis, ddof=1)
                else:
                    scale_ = np.nanstd(X, axis=self.axis)
                
            # Xr is a view on the original array that enables easy use of
            # broadcasting on the axis in which we are interested in
            Xr = np.rollaxis(X, self.axis)
            if self.with_mean:
                Xr -= mean_
                mean_1 = np.nanmean(Xr, axis=0)
                # Verify that mean_1 is 'close to zero'. If X contains very
                # large values, mean_1 can also be very large, due to a lack of
                # precision of mean_. In this case, a pre-scaling of the
                # concerned feature is efficient, for instance by its mean or
                # maximum.
                if not np.allclose(mean_1, 0):
                    Xr -= mean_1
            if self.with_std:
                scale_ = _handle_zeros_in_scale(scale_, copy=False)
                Xr /= scale_
                if self.with_mean:
                    mean_2 = np.nanmean(Xr, axis=0)
                    # If mean_2 is not 'close to zero', it comes from the fact that
                    # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                    # if mean_1 was close to zero. The problem is thus essentially
                    # due to the lack of precision of mean_. A solution is then to
                    # subtract the mean again:
                    if not np.allclose(mean_2, 0):
                        warnings.warn("Numerical issues were encountered "
                                    "when scaling the data "
                                    "and might not be solved. The standard "
                                    "deviation of the data is probably "
                                    "very close to 0. ")
                        Xr -= mean_2
        return X
Example #24
def run_pca(matrix,
            pca_genes=None,
            pca_bcs=None,
            n_pca_components=None,
            random_state=None):
    if pca_genes is None:
        pca_genes = matrix.genes_dim
    if pca_bcs is None:
        pca_bcs = matrix.bcs_dim
    if n_pca_components is None:
        n_pca_components = cr_constants.PCA_N_COMPONENTS_DEFAULT
        if n_pca_components > pca_genes:
            print "There are fewer nonzero genes than PCA components; reducing the number of components."
            n_pca_components = pca_genes
    if random_state is None:
        random_state = cr_constants.RANDOM_STATE

    np.random.seed(0)

    (full_norm_mat, full_center, full_scale) = normalize_and_transpose(matrix)

    # initialize PCA subsets
    pca_bc_indices = np.arange(matrix.bcs_dim)
    pca_gene_indices = np.arange(matrix.genes_dim)

    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = normalize_by_umi(matrix)
    (mu, var) = summarize_columns(m.T)
    dispersion = get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?

    pca_gene_indices = np.argsort(dispersion)[-pca_genes:]

    if pca_bcs < matrix.bcs_dim:
        pca_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim),
                             size=pca_bcs,
                             replace=False))

    pca_mat, _, pca_genes_nonzero = matrix.select_barcodes(
        pca_bc_indices).select_genes(pca_gene_indices).select_nonzero_axes()
    pca_gene_nonzero_indices = pca_gene_indices[pca_genes_nonzero]

    if pca_mat.genes_dim < 2 or pca_mat.bcs_dim < 2:
        print "Matrix is too small for further downsampling - num_pca_bcs and num_pca_genes will be ignored."
        pca_mat, _, pca_genes_nonzero = matrix.select_nonzero_axes()
        pca_gene_nonzero_indices = pca_genes_nonzero

    (pca_norm_mat, pca_center, pca_scale) = normalize_and_transpose(pca_mat)

    (u, d, v, _, _) = irlb(pca_norm_mat,
                           n_pca_components,
                           center=pca_center.squeeze(),
                           scale=pca_scale.squeeze(),
                           random_state=random_state)

    # make sure to project the matrix before centering, to avoid densification
    sparsefuncs.inplace_column_scale(full_norm_mat, 1 / full_scale.squeeze())
    transformed_irlba_matrix = full_norm_mat[:, pca_gene_nonzero_indices].dot(
        v) - (full_center / full_scale)[:, pca_gene_nonzero_indices].dot(v)
    irlba_components = np.zeros((n_pca_components, matrix.genes_dim))
    irlba_components[:, pca_gene_nonzero_indices] = v.T

    # calc proportion of variance explained
    variance_sum = len(
        pca_gene_indices
    )  # each gene has variance=1, mean=0 after normalization
    variance_explained = np.square(d) / (
        (len(pca_bc_indices) - 1) * variance_sum)

    genes_selected = np.array([gene.id for gene in matrix.genes
                               ])[pca_gene_nonzero_indices]

    # sanity check dimensions
    assert transformed_irlba_matrix.shape == (matrix.bcs_dim, n_pca_components)
    assert irlba_components.shape == (n_pca_components, matrix.genes_dim)
    assert variance_explained.shape == (n_pca_components, )

    return PCA(transformed_irlba_matrix, irlba_components, variance_explained,
               dispersion, genes_selected)
def normalize_matrix(matr, scale):
    """normalize a matrix with some scale"""
    m = matr.copy().astype(np.float64)
    scale = np.median(scale) / scale
    sparsefuncs.inplace_column_scale(m, scale)
    return m
Example #27
def scaling(pat_mat):
    mean, var = mean_variance_axis(pat_mat, axis=0)
    # var[var == 0.0] = 1.0
    inplace_column_scale(pat_mat, 1 / np.sqrt(var))
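
The commented-out guard matters when a column is constant: its variance is zero, so 1 / np.sqrt(var) is infinite and the scaled column fills with non-finite values. A short sketch of the guarded version on toy data:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis, inplace_column_scale

pat_mat = sp.csc_matrix(np.array([[1.0, 5.0],
                                  [2.0, 5.0],
                                  [4.0, 5.0]]))
mean, var = mean_variance_axis(pat_mat, axis=0)
var[var == 0.0] = 1.0                     # the guard left commented out above
inplace_column_scale(pat_mat, 1 / np.sqrt(var))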