Esempio n. 1
0
    def get_z_latent(self, adata, encoder_labels):
        """Encode ``adata`` into the latent space of the network.

        Runs the encoder part of the model on the data and returns the
        latent-space coordinates of every sample.

        Parameters
        ----------
        adata: :class:`~anndata.AnnData`
            Annotated data matrix with ``adata.X`` of shape
            [n_obs, x_dimension].
        encoder_labels: :class:`~numpy.ndarray`
            Condition labels fed to the encoder.

        Returns
        -------
        adata_latent: :class:`~anndata.AnnData`
            Annotated data holding the latent encoding of ``adata``.
        """
        dense_adata = remove_sparsity(adata)

        # index [2] selects the sampled latent output of the encoder model
        z = self.encoder_model.predict([dense_adata.X, encoder_labels])[2]
        # guard against numerical blow-ups in the network output
        z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)

        adata_latent = anndata.AnnData(X=z)
        adata_latent.obs = dense_adata.obs.copy(deep=True)

        return adata_latent
Esempio n. 2
0
def ari(adata, label_key):
    """Compute the Adjusted Rand Index (ARI) for ``adata``.

    KMeans (with as many clusters as there are unique values in
    ``adata.obs[label_key]``) is run on ``adata.X`` and the resulting
    cluster assignment is compared against the true labels.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Column in ``adata.obs`` holding the ground-truth labels.

    Returns
    -------
    score: float
        ARI score between the KMeans clustering and the true labels.
    """
    adata = remove_sparsity(adata)

    true_labels = LabelEncoder().fit_transform(adata.obs[label_key].values)
    n_clusters = adata.obs[label_key].nunique()

    predicted = KMeans(n_clusters, n_init=200).fit_predict(adata.X)

    return adjusted_rand_score(true_labels, predicted)
Esempio n. 3
0
def knn_purity(adata, label_key, n_neighbors=30):
    """Compute the KNN purity of ``adata`` w.r.t. ``label_key``.

    For every cell, the fraction of its ``n_neighbors`` nearest
    neighbours sharing the cell's label is computed; these per-cell
    scores are averaged within each label class and then across classes.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Column in ``adata.obs`` holding the labels.
    n_neighbors: int
        Number of nearest neighbours inspected per cell.

    Returns
    -------
    score: float
        KNN purity score, a float between 0 and 1.
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[label_key].to_numpy())

    # +1 neighbours because each point is its own nearest neighbour;
    # the [:, 1:] slice drops that self-match
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    neighbor_idx = knn.kneighbors(adata.X, return_distance=False)[:, 1:]
    neighbor_labels = encoded[neighbor_idx]

    # per-cell purity: share of neighbours carrying the same label
    per_cell = (neighbor_labels == encoded.reshape(-1, 1)).mean(axis=1)
    # average within each label class, then across classes
    per_class = [np.mean(per_cell[encoded == lbl])
                 for lbl in np.unique(encoded)]

    return np.mean(per_class)
Esempio n. 4
0
    def predict(self, adata, encoder_labels, decoder_labels):
        """Reconstruct ``adata`` by passing it through the full CVAE.

        Parameters
        ----------
        adata: :class:`~anndata.AnnData`
            Annotated data matrix in the primary space.
        encoder_labels: :class:`~numpy.ndarray`
            Integer condition labels for the encoder.
        decoder_labels: :class:`~numpy.ndarray`
            Integer condition labels for the decoder.

        Returns
        -------
        adata_pred: `~anndata.AnnData`
            Annotated data holding the reconstructed cells.
        """
        adata = remove_sparsity(adata)

        enc_onehot = to_categorical(encoder_labels, num_classes=self.n_conditions)
        dec_onehot = to_categorical(decoder_labels, num_classes=self.n_conditions)

        # first model output is the reconstruction
        reconstructed = self.cvae_model.predict(
            [adata.X, enc_onehot, dec_onehot])[0]

        adata_pred = anndata.AnnData(X=reconstructed)
        adata_pred.obs = adata.obs
        adata_pred.var_names = adata.var_names

        return adata_pred
Esempio n. 5
0
    def evaluate(self, adata, batch_key):
        """Predict labels for ``adata`` and score them against
        ``adata.obs[batch_key]``.

        Prints a classification report and returns the accuracy together
        with the confusion matrix.
        """
        adata = remove_sparsity(adata)

        enc_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)
        dec_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)

        enc_onehot = to_categorical(enc_labels, num_classes=self.n_conditions)
        dec_onehot = to_categorical(dec_labels, num_classes=self.n_conditions)

        # third model output holds class probabilities; argmax -> encoded label
        predictions = self.cvae_model.predict(
            [adata.X, enc_onehot, dec_onehot])[2].argmax(axis=1)

        self._reverse_cell_type_encoder()
        # map encoded predictions back to their original names
        predicted_names = np.array(
            [self.inv_cell_type_encoder[p] for p in predictions])

        # NOTE(review): ground truth is read from `batch_key`, while the
        # predictions are decoded through the cell-type encoder — callers
        # presumably pass the cell-type column here; confirm.
        true_labels = adata.obs[batch_key].values
        accuracy = np.mean(predicted_names == true_labels)

        print(classification_report(true_labels, predicted_names))

        return accuracy, confusion_matrix(true_labels, predicted_names)
Esempio n. 6
0
def silhouette(adata, group_key, metric='euclidean', scale=True):
    """Average silhouette width of ``adata.obs[group_key]`` on ``adata.X``.

    Thin wrapper around :func:`sklearn.metrics.silhouette_score`. Raw
    values lie in [-1, 1] (1 = ideal fit, 0 = overlapping clusters,
    -1 = misclassified cells); with ``scale=True`` the score is mapped
    linearly onto [0, 1].
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[group_key].values)
    score = silhouette_score(adata.X, encoded, metric=metric)
    return (score + 1) / 2 if scale else score
Esempio n. 7
0
def entropy_batch_mixing(adata,
                         label_key='batch',
                         n_neighbors=50,
                         n_pools=50,
                         n_samples_per_pool=100):
    """Compute the Entropy of Batch Mixing (EBM) metric for ``adata``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Column in ``adata.obs`` identifying the batch of each cell.
    n_neighbors: int
        Number of nearest neighbours per cell.
    n_pools: int
        Number of random pools whose entropies are averaged.
    n_samples_per_pool: int
        Number of cells sampled (with replacement) into each pool.

    Returns
    -------
    score: float
        EBM score, a float between zero and one.
    """
    adata = remove_sparsity(adata)

    batch_values = adata.obs[label_key].values

    # +1 neighbours / [:, 1:] slice: skip each cell's self-match
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    neighbor_idx = knn.kneighbors(adata.X, return_distance=False)[:, 1:]
    neighbor_batches = np.vectorize(lambda i: batch_values[i])(neighbor_idx)

    # one entropy value per cell, computed over its neighbourhood's batches
    entropies = np.apply_along_axis(__entropy_from_indices,
                                    axis=1,
                                    arr=neighbor_batches)

    if n_pools == 1:
        return np.mean(entropies)

    # average of n_pools pool-means, each over a random subsample of cells
    pooled = [
        np.mean(entropies[np.random.choice(len(entropies),
                                           size=n_samples_per_pool)])
        for _ in range(n_pools)
    ]
    return np.mean(pooled)
Esempio n. 8
0
def silhouette_batch(adata,
                     batch_key,
                     group_key,
                     metric='euclidean',
                     verbose=True,
                     scale=True):
    """Silhouette score of batch labels, computed within each group.

    For every group (e.g. cell type) in ``adata.obs[group_key]`` that
    contains more than one batch, per-cell silhouette values of the
    batch labels are computed on ``adata.X``. Absolute values are taken
    so that any deviation from zero counts as batch separation; with
    ``scale=True`` they are reported as ``1 - |s|`` so that higher is
    better.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    batch_key: str
        Column in ``adata.obs`` with the batches to be compared.
    group_key: str
        Column in ``adata.obs`` used to subset the data, e.g. cell type.
    metric: str
        Distance metric, see sklearn silhouette score.
    verbose: bool
        If True, print the per-group mean scores.
    scale: bool
        If True, rescale so that the highest number is optimal.

    Returns
    -------
    sil_all: :class:`~pandas.DataFrame`
        Per-cell absolute silhouette scores with their group label.
    sil_means: :class:`~pandas.DataFrame`
        Mean score per group.
    """
    adata = remove_sparsity(adata)
    glob_batches = adata.obs[batch_key].values
    batch_enc = LabelEncoder()
    batch_enc.fit(glob_batches)

    per_group_frames = []
    for group in adata.obs[group_key].unique():
        adata_group = adata[adata.obs[group_key] == group]
        # silhouette is undefined for a single cluster; skip such groups
        if adata_group.obs[batch_key].nunique() == 1:
            continue
        batches = batch_enc.transform(adata_group.obs[batch_key])
        sil_per_group = silhouette_samples(adata_group.X,
                                           batches,
                                           metric=metric)
        # take only absolute value
        sil_per_group = [abs(i) for i in sil_per_group]
        if scale:
            # scale s.t. highest number is optimal
            sil_per_group = [1 - i for i in sil_per_group]
        per_group_frames.append(pd.DataFrame({
            'group': [group] * len(sil_per_group),
            'silhouette_score': sil_per_group
        }))

    # DataFrame.append was removed in pandas 2.0; build the result once
    # via concat instead of appending inside the loop
    if per_group_frames:
        sil_all = pd.concat(per_group_frames, ignore_index=True)
    else:
        sil_all = pd.DataFrame(columns=['group', 'silhouette_score'])
    sil_means = sil_all.groupby('group').mean()

    if verbose:
        print(f'mean silhouette per cell: {sil_means}')
    return sil_all, sil_means
Esempio n. 9
0
    def __init__(self,
                 filename: str,
                 adata: anndata.AnnData,
                 batch_key: str,
                 cell_type_key: str,
                 encoder_model: Model,
                 n_per_epoch: int = 5,
                 n_batch_labels: int = 0,
                 n_celltype_labels: int = 0,
                 clustering_scores: list_or_str = 'all'):
        """Callback that pre-computes label encodings for score evaluation.

        Parameters
        ----------
        filename: str
            File name stored on the callback (consumed by other methods
            of the class, not visible here).
        adata: :class:`~anndata.AnnData`
            Dataset the scores are evaluated on; densified via
            ``remove_sparsity`` before being stored.
        batch_key: str
            Column in ``adata.obs`` with batch (condition) labels.
        cell_type_key: str
            Column in ``adata.obs`` with cell-type labels.
        encoder_model: Model
            Encoder network stored for later scoring.
        n_per_epoch: int
            Stored as-is; presumably the evaluation frequency in epochs.
        n_batch_labels: int
            Width of the batch one-hot encoding.
        n_celltype_labels: int
            Width of the cell-type one-hot encoding.
        clustering_scores: list or str
            'all' or a subset of the keys in ``self.score_computers``.
        """
        super(ScoreCallback, self).__init__()
        self.adata = remove_sparsity(adata)

        # NOTE(review): label_encoder below receives the original `adata`,
        # not the densified `self.adata` — fine if it only reads .obs;
        # confirm against the helper's implementation.
        self.batch_labels, _ = label_encoder(adata,
                                             le=None,
                                             condition_key=batch_key)
        # flatten to 1-D before one-hot encoding
        self.batch_labels = np.reshape(self.batch_labels, (-1, ))
        self.batch_labels_onehot = to_categorical(self.batch_labels,
                                                  num_classes=n_batch_labels)

        self.celltype_labels, _ = label_encoder(adata,
                                                le=None,
                                                condition_key=cell_type_key)
        self.celltype_labels = np.reshape(self.celltype_labels, (-1, ))
        self.celltype_labels_onehot = to_categorical(
            self.celltype_labels, num_classes=n_celltype_labels)

        self.filename = filename
        self.encoder_model = encoder_model
        self.n_per_epoch = n_per_epoch

        self.n_batch_labels = n_batch_labels
        self.n_celltype_labels = n_celltype_labels

        # dispatch table: score name -> bound computing method
        self.clustering_scores = clustering_scores
        self.score_computers = {
            "asw": self.asw,
            "ari": self.ari,
            "nmi": self.nmi,
            "ebm": self.entropy_of_batch_mixing,
            "knn": self.knn_purity
        }

        # KMeans instances are created once here and reused for scoring
        self.kmeans_batch = KMeans(self.n_batch_labels, n_init=200)
        self.kmeans_celltype = KMeans(self.n_celltype_labels, n_init=200)
Esempio n. 10
0
    def annotate(self, adata, batch_key, cell_type_key):
        """Predict cell types for ``adata`` and write them to
        ``adata.obs[f'pred_{cell_type_key}']``.
        """
        # NOTE(review): if remove_sparsity returns a copy, the obs write
        # below lands on that copy, not on the caller's object — confirm.
        adata = remove_sparsity(adata)

        enc_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)
        dec_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)

        enc_onehot = to_categorical(enc_labels, num_classes=self.n_conditions)
        dec_onehot = to_categorical(dec_labels, num_classes=self.n_conditions)

        # third model output holds class probabilities; argmax -> encoded label
        encoded_preds = self.cvae_model.predict(
            [adata.X, enc_onehot, dec_onehot])[2].argmax(axis=1)

        self._reverse_cell_type_encoder()
        decoded = [self.inv_cell_type_encoder[p] for p in encoded_preds]

        adata.obs[f'pred_{cell_type_key}'] = np.array(decoded)
Esempio n. 11
0
def nmi_helper(adata, group1, group2, method="arithmetic"):
    """Normalized mutual information (NMI) between two cluster assignments.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    group1: str, :class:`~pandas.Series` or list-like
        Column name of ``adata.obs`` or an explicit group assignment.
    group2: str
        Column name of ``adata.obs`` holding the second assignment
        (label-encoded before scoring).
    method: str
        ``average_method`` forwarded to scikit-learn; one of 'max',
        'min', 'geometric' or 'arithmetic'.

    Returns
    -------
    float
        Normalized mutual information of the two assignments.

    Raises
    ------
    ValueError
        If the two assignments differ in length or ``method`` is not
        one of the supported values.
    """
    adata = remove_sparsity(adata)

    if isinstance(group1, str):
        group1 = adata.obs[group1].tolist()
    elif isinstance(group1, pd.Series):
        group1 = group1.tolist()

    group2 = LabelEncoder().fit_transform(adata.obs[group2].values)

    if len(group1) != len(group2):
        raise ValueError(
            f'different lengths in group1 ({len(group1)}) and group2 ({len(group2)})'
        )

    if method not in ('max', 'min', 'geometric', 'arithmetic'):
        raise ValueError(f"Method {method} not valid")

    return normalized_mutual_info_score(group1,
                                        group2,
                                        average_method=method)
Esempio n. 12
0
    def to_mmd_layer(self, adata, batch_key):
        """Map ``adata`` into the MMD space of the network.

        Feeds the data through the CVAE and returns the activations of
        the MMD layer for every sample.

        Parameters
        ----------
        adata: :class:`~anndata.AnnData`
            Annotated data matrix with ``adata.X`` of shape
            [n_obs, x_dimension].
        batch_key: str
            Column in ``adata.obs`` with the condition labels; encoded
            via ``self.condition_encoder`` for both encoder and decoder.

        Returns
        -------
        adata_mmd: :class:`~anndata.AnnData`
            Annotated data holding the MMD-space encoding of ``adata``.
        """
        adata = remove_sparsity(adata)

        enc_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)
        dec_labels, _ = label_encoder(adata, self.condition_encoder, batch_key)

        enc_onehot = to_categorical(enc_labels, num_classes=self.n_conditions)
        dec_onehot = to_categorical(dec_labels, num_classes=self.n_conditions)

        # second model output is the MMD layer activation
        mmd_coords = self.cvae_model.predict(
            [adata.X, enc_onehot, dec_onehot])[1]
        # guard against numerical blow-ups in the network output
        mmd_coords = np.nan_to_num(mmd_coords, nan=0.0, posinf=0.0, neginf=0.0)

        adata_mmd = anndata.AnnData(X=mmd_coords)
        adata_mmd.obs = adata.obs.copy(deep=True)

        return adata_mmd
Esempio n. 13
0
def asw(adata, label_key):
    """Average Silhouette Width (ASW) of ``adata.obs[label_key]``.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset.
    label_key: str
        Column in ``adata.obs`` holding the labels.

    Returns
    -------
    score: float
        ASW score, a float between -1 and 1.
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[label_key].values)
    return silhouette_score(adata.X, encoded)
Esempio n. 14
0
    def predict(self, adata, encoder_labels, decoder_labels):
        """Reconstruct ``adata`` by passing it through the full CVAE.

        Parameters
        ----------
        adata: :class:`~anndata.AnnData`
            Annotated data matrix in the primary space.
        encoder_labels: :class:`~numpy.ndarray`
            Integer condition labels for the encoder.
        decoder_labels: :class:`~numpy.ndarray`
            Integer condition labels for the decoder.

        Returns
        -------
        adata_pred: `~anndata.AnnData`
            Annotated data holding the reconstructed cells.
        """
        adata = remove_sparsity(adata)

        enc_onehot = to_categorical(encoder_labels,
                                    num_classes=self.n_conditions)
        dec_onehot = to_categorical(decoder_labels,
                                    num_classes=self.n_conditions)

        if self.loss_fn in ['nb', 'zinb']:
            # NB/ZINB losses need per-cell size factors as an extra input.
            # NOTE(review): they are read from the *training* data
            # (self.adata), not the `adata` argument — this breaks if the
            # two differ in n_obs; confirm intent.
            inputs = [
                adata.X, enc_onehot, dec_onehot,
                self.adata.obs[self.size_factor_key]
            ]
        else:
            inputs = [adata.X, enc_onehot, dec_onehot]

        x_hat = self.cvae_model.predict(inputs)

        adata_pred = anndata.AnnData(X=x_hat)
        adata_pred.obs = adata.obs
        adata_pred.var_names = adata.var_names

        return adata_pred
Esempio n. 15
0
def opt_louvain(adata, label_key, cluster_key, function=None, resolutions=None,
                inplace=True, plot=False, verbose=True, **kwargs):
    """Optimise the Louvain clustering resolution against a score function.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Annotated dataset; neighbours are computed if missing.
    label_key: str
        Column in ``adata.obs`` containing the biological labels to be
        optimised against.
    cluster_key: str
        Column added to ``adata.obs`` during clustering; overwritten if
        it already exists.
    function: callable
        Cost function to maximise. Called as
        ``function(adata, label_key, cluster_key, **kwargs)`` and must
        return a number.
    resolutions: list of float, optional
        Resolutions to be scanned. If None, 20 default values ranging
        between 0.1 and 2 are used.
    inplace: bool
        If True, store the best clustering in ``adata.obs[cluster_key]``.
    plot: bool
        If True, plot the score profile over resolution.
    verbose: bool
        If True, print progress information.

    Returns
    -------
    res_max: float
        Resolution of the maximum score.
    score_max: float
        Maximum score.
    score_all: :class:`~pandas.DataFrame`
        All scores at all resolutions; can be used to plot the profile.
    clustering: :class:`~pandas.Series`
        Only if ``inplace=False``: the best cluster assignment.

    Raises
    ------
    ValueError
        If ``function`` is None.
    """
    # fail early with a clear message rather than a TypeError when the
    # (None-defaulted) score function is called inside the loop
    if function is None:
        raise ValueError("`function` must be a callable scoring function, "
                         "e.g. nmi_helper")

    adata = remove_sparsity(adata)

    if resolutions is None:
        n = 20
        resolutions = [2 * x / n for x in range(1, n + 1)]

    score_max = 0
    res_max = resolutions[0]
    clustering = None
    score_all = []

    # recompute neighbours if not existing
    try:
        adata.uns['neighbors']
    except KeyError:
        if verbose:
            print('computing neigbours for opt_cluster')
        sc.pp.neighbors(adata)

    for res in resolutions:
        sc.tl.louvain(adata, resolution=res, key_added=cluster_key)
        score = function(adata, label_key, cluster_key, **kwargs)
        score_all.append(score)
        if score_max < score:
            score_max = score
            res_max = res
            clustering = adata.obs[cluster_key]
        # drop the temporary column so the next iteration starts clean
        del adata.obs[cluster_key]

    if verbose:
        print(f'optimised clustering against {label_key}')
        print(f'optimal cluster resolution: {res_max}')
        print(f'optimal score: {score_max}')

    score_all = pd.DataFrame(zip(resolutions, score_all), columns=('resolution', 'score'))
    if plot:
        # score vs. resolution profile
        sns.lineplot(data=score_all, x='resolution', y='score').set_title('Optimal cluster resolution profile')
        plt.show()

    if inplace:
        adata.obs[cluster_key] = clustering
        return res_max, score_max, score_all
    else:
        return res_max, score_max, score_all, clustering