Example #1
    def transform(self, X):
        """
        Used to transform data after fiting
        :param X: list
        :return: list
        """
        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)
        if self.axis == 0:
            X = X.T
        if sparse.issparse(X):
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(X)
            elif self.norm == 'max':
                norms_elementwise = self.norms.repeat(np.diff(X.indptr))
                mask = norms_elementwise != 0
                X.data[mask] /= norms_elementwise[mask]
        else:
            X /= self.norms[:, np.newaxis]

        if self.axis == 0:
            X = X.T

        return X
Example #2
    def fit(self, X):
        """
        Used to fit Noramlizer with data
        :param X: list
        :return: nothing
        """
        if self.norm not in ('l1', 'l2', 'max'):
            raise ValueError("'%s' is not a supported norm" % self.norm)

        if self.axis == 0:
            self.sparse_format = 'csc'
        elif self.axis == 1:
            self.sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % self.axis)

        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)
        if self.axis == 0:
            X = X.T

        if sparse.issparse(X):
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(X)
            elif self.norm == 'max':
                _, self.norms = min_max_axis(X, 1)
        else:
            if self.norm == 'l1':
                self.norms = np.abs(X).sum(axis=1)
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                self.norms = np.max(X, axis=1)
            self.norms = _handle_zeros_in_scale(self.norms, copy=False)
Example #3
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False, shrink=0):
    """Scale input vectors individually to unit norm (vector length).

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : 'l1', 'l2', or 'max', optional ('l2' by default)
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        Axis along which the data is normalized. If 1, independently normalize
        each sample; otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        Set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    return_norm : boolean, default False
        Whether to return the computed norms.

    shrink : float, default 0
        Extra argument forwarded to ``inplace_csr_row_normalize_l2`` by this
        modified variant; it is not part of the standard scikit-learn
        signature.

    Returns
    -------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Normalized input X.

    norms : array, shape [n_samples] if axis=1 else [n_features]
        An array of norms along the given axis for X.
        When X is sparse and return_norm is True, a NotImplementedError
        is raised for norm 'l1' or 'l2'.

    See also
    --------
    Normalizer: Performs normalization using the ``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.

    """
    if norm not in ('l1', 'l2', 'max'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X,
                    sparse_format,
                    copy=copy,
                    estimator='the normalize function',
                    dtype=FLOAT_DTYPES)
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        if return_norm and norm in ('l1', 'l2'):
            raise NotImplementedError("return_norm=True is not implemented "
                                      "for sparse matrices with norm 'l1' "
                                      "or norm 'l2'")
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X, shrink)
        elif norm == 'max':
            _, norms = min_max_axis(X, 1)
            norms_elementwise = norms.repeat(np.diff(X.indptr))
            mask = norms_elementwise != 0
            X.data[mask] /= norms_elementwise[mask]
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
        elif norm == 'l2':
            norms = row_norms(X)
        elif norm == 'max':
            norms = np.max(X, axis=1)
        norms = _handle_zeros_in_scale(norms, copy=False)
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    if return_norm:
        return X, norms
    else:
        return X
Example #4
    def normalize(self, X, norm='l2', axis=1, copy=True):
        """Normalize a dataset along any axis.

        Parameters
        ----------
        X : array or scipy.sparse matrix with shape [n_samples, n_features]
            The data to normalize, element by element.
            scipy.sparse matrices should be in CSR format to avoid an
            unnecessary copy.

        norm : 'l1' or 'l2', optional ('l2' by default)
            The norm to use to normalize each non-zero sample (or each
            non-zero feature if axis is 0).

        axis : 0 or 1, optional (1 by default)
            Axis along which the data is normalized. If 1, independently
            normalize each sample; otherwise (if 0) normalize each feature.

        copy : boolean, optional, default True
            Set to False to perform inplace row normalization and avoid a
            copy (if the input is already a numpy array or a scipy.sparse
            CSR matrix and if axis is 1).

        See also
        --------
        :class:`sklearn.preprocessing.Normalizer` to perform normalization
        using the ``Transformer`` API (e.g. as part of a preprocessing
        :class:`sklearn.pipeline.Pipeline`)
        """
        if norm not in ('l1', 'l2'):
            raise ValueError("'%s' is not a supported norm" % norm)

        if axis == 0:
            sparse_format = 'csc'
        elif axis == 1:
            sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % axis)

        X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0]
        warn_if_not_float(X, 'The normalize function')
        if axis == 0:
            X = X.T

        if sparse.issparse(X):
            if norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif norm == 'l2':
                inplace_csr_row_normalize_l2(X)
        else:
            if norm == 'l1':
                norms = np.abs(X).sum(axis=1)
                norms[norms == 0.0] = 1.0
            elif norm == 'l2':
                norms = row_norms(X)
                norms[norms == 0.0] = 1.0
            X /= norms[:, np.newaxis]

        if axis == 0:
            X = X.T

        return X
Example #5
    def _one_fit(self):
        if self.verbose:
            print("\nCreating synthetic doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        if self.verbose:
            print("Normalizing...")
        if self.normalizer is not None:
            aug_counts = self.normalizer(
                sp_sparse.vstack((self._raw_counts, self._raw_synthetics)))
        else:
            # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
            synth_lib_size = np.sum(self._raw_synthetics, axis=1).A1
            aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
            normed_synths = self._raw_synthetics.copy()
            inplace_csr_row_normalize_l1(normed_synths)
            aug_counts = sp_sparse.vstack(
                (self._normed_raw_counts, normed_synths))
            aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1)

        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        aug_counts = anndata.AnnData(aug_counts)
        aug_counts.obs["n_counts"] = aug_lib_size
        if self.standard_scaling:
            sc.pp.scale(aug_counts, max_value=15)

        if self.verbose:
            print("Running PCA...")
        sc.tl.pca(aug_counts,
                  n_comps=self.n_components,
                  random_state=self.random_state)
        if self.verbose:
            print("Clustering augmented data set...\n")
        if self.use_phenograph:
            f = io.StringIO()
            with redirect_stdout(f):
                fullcommunities, _, _ = phenograph.cluster(
                    aug_counts.obsm["X_pca"], **self.phenograph_parameters)
            out = f.getvalue()
            if self.verbose:
                print(out)
        else:
            sc.pp.neighbors(
                aug_counts,
                random_state=self.random_state,
                method="umap",
                n_neighbors=10,
            )
            sc.tl.louvain(aug_counts,
                          random_state=self.random_state,
                          resolution=4,
                          directed=False)
            fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        if self.verbose:
            print("Found clusters [{0}, ... {2}], with sizes: {1}\n".format(
                min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_log_p_values = {
            i: hypergeom.logsf(
                synth_cells_per_comm[i],
                aug_counts.shape[0],
                self._synthetics.shape[0],
                synth_cells_per_comm[i] + orig_cells_per_comm[i],
            )
            for i in community_IDs
        }
        log_p_values = np.array(
            [community_log_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            log_p_values[self.communities_ == -1] = np.nan

        return scores, log_p_values
Example #6
    def fit(self, raw_counts):
        """Fits the classifier on raw_counts.

        Args:
            raw_counts (array-like): Count matrix, oriented cells by genes.

        Sets:
            all_scores_, all_p_values_, all_log_p_values_, communities_,
            top_var_genes, parents, synth_communities

        Returns:
            The fitted classifier.
        """

        raw_counts = check_array(
            raw_counts,
            accept_sparse="csr",
            force_all_finite=True,
            ensure_2d=True,
            dtype="float32",
        )

        if not sp_sparse.issparse(raw_counts):
            if self.verbose:
                print("Sparsifying matrix.")
            raw_counts = csr_matrix(raw_counts)

        if self.n_top_var_genes > 0:
            if self.n_top_var_genes < raw_counts.shape[1]:
                gene_variances = (np.array(raw_counts.power(2).mean(axis=0)) -
                                  (np.array(raw_counts.mean(axis=0)))**2)[0]
                top_var_indexes = np.argsort(gene_variances)
                self.top_var_genes_ = top_var_indexes[-self.n_top_var_genes:]
                # CSC is faster for column indexing
                raw_counts = raw_counts.tocsc()
                raw_counts = raw_counts[:, self.top_var_genes_]
                raw_counts = raw_counts.tocsr()

        self._raw_counts = raw_counts
        (self._num_cells, self._num_genes) = self._raw_counts.shape
        if self.normalizer is None:
            # Memoize these; the default normalizer treats them as invariant
            # across all synthetic doublets
            self._lib_size = np.sum(raw_counts, axis=1).A1
            self._normed_raw_counts = self._raw_counts.copy()
            inplace_csr_row_normalize_l1(self._normed_raw_counts)

        self.all_scores_ = np.zeros((self.n_iters, self._num_cells))
        self.all_log_p_values_ = np.zeros((self.n_iters, self._num_cells))
        all_communities = np.zeros((self.n_iters, self._num_cells))
        all_parents = []
        all_synth_communities = np.zeros(
            (self.n_iters, int(self.boost_rate * self._num_cells)))

        for i in tqdm(range(self.n_iters)):
            if self.verbose:
                print("Iteration {:3}/{}".format(i + 1, self.n_iters))
            self.all_scores_[i], self.all_log_p_values_[i] = self._one_fit()
            all_communities[i] = self.communities_
            all_parents.append(self.parents_)
            all_synth_communities[i] = self.synth_communities_

        # Release unneeded large data vars
        del self._raw_counts
        del self._norm_counts
        del self._raw_synthetics
        del self._synthetics
        if self.normalizer is None:
            del self._normed_raw_counts
            del self._lib_size

        self.communities_ = all_communities
        self.parents_ = all_parents
        self.synth_communities_ = all_synth_communities
        self.all_p_values_ = np.exp(self.all_log_p_values_)

        return self
Example #7
def normalize(X, norm='l2', axis=1, copy=True):
    """Scale input vectors individually to unit norm (vector length).

    Parameters
    ----------
    X : array or scipy.sparse matrix with shape [n_samples, n_features]
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : 'l1' or 'l2', optional ('l2' by default)
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).

    axis : 0 or 1, optional (1 by default)
        Axis along which the data is normalized. If 1, independently normalize
        each sample; otherwise (if 0) normalize each feature.

    copy : boolean, optional, default True
        Set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    See also
    --------
    :class:`sklearn.preprocessing.Normalizer` to perform normalization
    using the ``Transformer`` API (e.g. as part of a preprocessing
    :class:`sklearn.pipeline.Pipeline`)
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm" % norm)

    if axis == 0:
        sparse_format = 'csc'
    elif axis == 1:
        sparse_format = 'csr'
    else:
        raise ValueError("'%d' is not a supported axis" % axis)

    X = check_array(X, sparse_format, copy=copy)
    warn_if_not_float(X, 'The normalize function')
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        X = check_array(X, accept_sparse=sparse_format, dtype=np.float64)
        if norm == 'l1':
            inplace_csr_row_normalize_l1(X)
        elif norm == 'l2':
            inplace_csr_row_normalize_l2(X)
    else:
        if norm == 'l1':
            norms = np.abs(X).sum(axis=1)
            norms[norms == 0.0] = 1.0
        elif norm == 'l2':
            norms = row_norms(X)
            norms[norms == 0.0] = 1.0
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    return X