Exemple #1
0
def maxLikGlobalDimEst(adata, k=20, nrpcs=50, rlib_loc=''):
    """
    Estimate the intrinsic dimensionality of the data.

    Based on the 'maxLikGlobalDimEst' function of the 'intrinsicDimension' R package,
    which is called through rpy2 on a PCA embedding of ``adata``.

    Note that ``adata`` is modified in place: a PCA with ``nrpcs`` components is
    computed on it and ``adata.obsm['X_pca']`` is multiplied by -1.

    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    k: `int` | default = 20
        Number of neighbours to use in the 'maxLikGlobalDimEst'. Choosing k between 10 and 20 generally yields the best results.
    nrpcs: `int` | default = 50
        Number of PCs to compute initially before estimating the dimensionality. Consider increasing it for very high dimensional data.
    rlib_loc: `str` | default = ''
        R library location that will be added to the default .libPaths() to locate the required packages.

    Returns
    -------
    `int`
        The estimated intrinsic dimensionality of the data (rounded), that can be used for graph clustering.

    Raises
    ------
    ImportError
        If rpy2 is not installed.
    """

    if importlib.util.find_spec('rpy2') is None:
        raise ImportError(
            "maxLikGlobalDimEst requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    # Kept from the original implementation although it is not referenced below.
    # NOTE(review): confirm whether an rpy2 converter for numpy arrays (e.g. via
    # anndata2ri) is activated elsewhere before 'pcs' is handed over to R.
    import anndata2ri  # noqa: F401

    # make the user supplied library location visible to R before loading the package
    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(intrinsicDimension))')

    random_state = 0
    print('Using random_state = 0 for all the following calculations')
    sc_pca(adata,
           svd_solver='arpack',
           random_state=random_state,
           n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat

    # hand the PCA embedding and k over to R and run the estimator there
    ro.globalenv['pcs'] = adata.obsm['X_pca']
    ro.globalenv['k'] = k
    ro.r('n <- maxLikGlobalDimEst(as.matrix(pcs), k=k, unbiased=TRUE)')

    ro.r('message("Estimated dimensionality: ", round(n$dim.est))')

    # the R call returns a vector; its first element is the rounded estimate
    n_dimest = ro.r('round(n$dim.est)')

    return int(n_dimest[0])
Exemple #2
0
def pca_neighbors_umap(adata,
                       results_folder,
                       nrpcs=50,
                       nrpcs_neigh=None,
                       nrneigh=10,
                       method='NULL'):
    '''
    Run PCA, nearest neighbor calculation and UMAP on adata and export the results.

    A PCA overview plot is saved to <results_folder>/figures/PCA.png and the
    metadata (3 PCs and the UMAP coordinates) is exported via export_metadata.
    adata is modified in place and also returned.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be processed and exported
    results_folder: `str`
        path to the results folder
    nrpcs: int | nrpcs = 50
        number of principal components to calculate
    nrpcs_neigh: int | nrpcs_neigh = None
        number of principal components to use for nearest neighbor calculation.
        When set to None the number is chosen automatically. For .n_vars < 50, .X is used, otherwise 'X_pca' is used with 50 components.
        When set to 0, .X is used instead of the PCs.
        Only used when no batch correction is applied.
    nrneigh: int | nrneigh = 10
        number of neighbors to use for the nearest neighbor calculation.
        Only used when no batch correction is applied.
    method: `str`
        Method for nearest neighbor calculation. When set to 'bbknn', BBKNN is
        applied on the column "batch" of adata.obs (the program exits if that
        column is missing; if it holds a single value no correction is applied).
        Any other value (e.g. the default 'NULL') runs the uncorrected
        nearest neighbor calculation.

    returns
    -------
    AnnData
        the AnnData object that was passed in, annotated with PCA, neighbors and UMAP
    '''
    start = time()
    random_state = 0
    print('Using random_state = 0 for all the following calculations')

    # PCA
    sc_pca(adata,
           svd_solver='arpack',
           random_state=random_state,
           n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
    print(
        "PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output."
    )

    # generate plot of PCA
    fig, (ax1, ax2) = subplots(ncols=2, nrows=1)
    fig.set_figwidth(12)
    fig.set_figheight(6)
    fig.tight_layout(pad=4.5)

    cumulative_variance = cumsum(adata.uns['pca']['variance_ratio'])
    # derive the x axis from the values that are plotted so that both always
    # have the same length
    x = list(range(len(cumulative_variance)))

    ax1.scatter(x=x, y=cumulative_variance)
    ax1.set_ylabel('cumulative explained variance')
    ax1.set_xlabel('PCA components')
    ax1.set_title('cumulative explained variance (as ratio)')

    sc_pl_pca(adata, ax=ax2)
    fig.savefig(join(results_folder, 'figures', 'PCA.png'))

    # display(fig)
    # Inner function for simple pca-umap without any correction
    def run_no_correction():
        neighbors(adata,
                  n_neighbors=nrneigh,
                  random_state=random_state,
                  n_pcs=nrpcs_neigh)
        print('Nearest neighbors calculated with n_neighbors = ' +
              str(nrneigh))
        if nrpcs_neigh == 0:
            print('Using .X to calculate nearest neighbors instead of PCs.')
            logging.info(
                'Neighborhood analysis performed with .X instead of PCs.')

    # neighbors
    if method == 'bbknn':
        if 'batch' not in adata.obs.columns:
            sys.exit(
                'bbknn correction requires a column "batch" in the observations.'
            )
        if len(set(adata.obs.get('batch'))) == 1:
            # a single batch cannot be corrected; fall back to plain neighbors
            print(
                'column "batch" only contains one value. We cannot correct for those; BBKNN is NOT applied.'
            )
            run_no_correction()
        else:
            bbknn.bbknn(adata)
    else:
        run_no_correction()
    # umap
    sc_umap(adata, random_state=random_state)
    print('UMAP coordinates calculated.')

    logging.info('Neighborhood analysis completed, and UMAP generated.')
    logging.info(
        '\t Time for PCA, nearest neighbor calculation and UMAP generation: ' +
        str(round(time() - start, 3)) + 's')

    # export metadata
    start = time()
    export_metadata(adata,
                    basepath=results_folder,
                    n_pcs=3,
                    umap=True,
                    tsne=False)
    logging.info(
        'Metadata containing 3 PCAs and UMAP coordinates exported successfully to file.'
    )
    logging.info('Time for export: ' + str(round(time() - start, 3)) + 's')

    return (adata)
Exemple #3
0
def recluster(adata,
              celltype,
              celltype_label='leiden',
              min_mean=0.0125,
              max_mean=4,
              min_disp=0.5,
              resolution=1.0,
              regress_out_key=None,
              random_seed=0,
              show_plot_filter=False,
              method='leiden',
              batch_key=None,
              n_shared=2):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset. This datasubset is initialized with the raw data contained in adata.raw. New highly
    variable genes are selected and a new clustering is performed. The function returns the adata
    subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the
    clusters that are to be selected for reclustering as strings or tuple of strings to the parameter
    celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more than one is to be selected
        please pass them as a tuple (a list is accepted as well).
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    resolution: `float` | default = 1.0
        resolution passed on to the clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out before
        performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable gene
        detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset. Possible:louvain/leiden
    batch_key: `str` | default = None
        Specify a batch key if the HVG calculation should be done per batch
    n_shared: `int` | default = 2
        Divide the nr. of batches by this nr. to get the shared HVGs considered (e.g. >=1/2 of samples)

    Returns
    -------

    AnnData object containing the subcluster annotated with PCA, nearest neighbors, cluster
    annotation (leiden or louvain), and UMAP coordinates.

    Raises
    ------
    ValueError
        if method is neither 'leiden' nor 'louvain'.
    KeyError
        if celltype_label is not a column of adata.obs.
    SystemExit
        if celltype is neither a string nor a tuple/list.

    Examples
    --------

    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")
    if not isinstance(celltype, (str, tuple, list)):
        # kept as sys.exit so that the raised exception type stays unchanged
        sys.exit('specify cluster input as a string or tuple')

    labels = adata.obs.get(celltype_label)
    if labels is None:
        raise KeyError("column '" + str(celltype_label) +
                       "' not found in adata.obs")

    # boolean mask over the observations selecting the requested cluster(s)
    if isinstance(celltype, str):
        selected_cells = labels == celltype
    else:
        selected_cells = labels.isin(list(celltype))
    cluster_subset = _subset_adata(adata, selected_cells)

    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    sc_highly_variable_genes(cluster_subset,
                             min_mean=min_mean,
                             max_mean=max_mean,
                             min_disp=min_disp,
                             inplace=True,
                             batch_key=batch_key)

    if batch_key is not None:
        # additionally mark genes as highly variable that are highly variable
        # in at least (number of batches / n_shared) batches
        n_batches = len(set(cluster_subset.obs[batch_key]))
        shared_hvg = (cluster_subset.var['highly_variable_nbatches'] >=
                      n_batches / n_shared)
        hvg_flags = cluster_subset.var['highly_variable'].copy()
        hvg_flags.loc[shared_hvg] = True
        cluster_subset.var['highly_variable'] = hvg_flags

    if show_plot_filter:
        pl_highly_variable_genes(cluster_subset, show=True)
    print('In total', str(cluster_subset.var.highly_variable.sum()),
          'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   cluster_subset.var.highly_variable,
                                   axis=1,
                                   raw=False)

    #perform further processing
    # no log1p here: data already logged
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset, max_value=10)
    #using `svd_solver='arpack' ensures that the PCA leads to reproducible results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)
    # method was validated above, so it is either 'louvain' or 'leiden'
    if method == 'louvain':
        louvain(cluster_subset,
                resolution=resolution,
                random_state=random_seed)
    else:
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return (cluster_subset)