Esempio n. 1
0
    def load_data(data):
        """Load an expression matrix into an AnnData object.

        ``data`` may be a ``.h5ad`` file, a ``.loom`` file, or a 10x
        ``mtx`` directory.  Relies on names from the enclosing scope:
        ``set_obs_names``, ``set_var_names``, ``make_obs_names_unique``,
        ``make_var_names_unique`` — presumably CLI options of the
        surrounding click command (TODO confirm against the decorator).

        Raises
        ------
        click.FileError
            If ``data`` is neither a recognised file nor a directory.
        click.UsageError
            If a requested obs/var column does not exist.
        """
        if isfile(data):
            name, extension = splitext(data)
            if extension == ".h5ad":
                adata = sc.read_h5ad(data)
            elif extension == ".loom":
                adata = sc.read_loom(data)
            else:
                raise click.FileError(data, hint="does not have a valid extension [.h5ad | .loom]")
        elif isdir(data):
            # the 10x reader expects a trailing path separator
            if not data.endswith(sep):
                data += sep
            adata = sc.read_10x_mtx(data)
        else:
            raise click.FileError(data, hint="not a valid file or path")

        # Optionally re-key observations/variables by an existing column.
        if not set_obs_names == "":
            if set_obs_names not in adata.obs_keys():
                raise click.UsageError(f"obs {set_obs_names} not found, options are: {adata.obs_keys()}")
            adata.obs_names = adata.obs[set_obs_names]
        if not set_var_names == "":
            if set_var_names not in adata.var_keys():
                raise click.UsageError(f"var {set_var_names} not found, options are: {adata.var_keys()}")
            adata.var_names = adata.var[set_var_names]
        if make_obs_names_unique:
            adata.obs.index = make_index_unique(adata.obs.index)
        if make_var_names_unique:
            adata.var.index = make_index_unique(adata.var.index)
        # NOTE(review): uses the private _obs/_var attributes; the public
        # adata.obs / adata.var would work here as well.
        if not adata._obs.index.is_unique:
            click.echo("Warning: obs index is not unique")
        if not adata._var.index.is_unique:
            click.echo("Warning: var index is not unique")
        return adata
Esempio n. 2
0
def loom2h5ad(loomFile, h5adFile):
    """Convert a Seurat-exported loom file to an h5ad file.

    Parameters
    ----------
    loomFile : str
        Input ``.loom`` path; the 'spliced' layer becomes the main matrix.
    h5adFile : str
        Output ``.h5ad`` path.

    Side effects: writes ``h5adFile`` with raw counts kept in
    ``layers['raw_data']`` / ``.raw`` and the normalised data in ``X``.
    """
    adata = sc.read_loom(loomFile,
                         sparse=True,
                         X_name='spliced',
                         obs_names='cell',
                         var_names='geneID',
                         dtype='float32')
    adata.var.set_index(['Gene'], inplace=True)
    # Strip the leading character and normalise barcode separators.
    adata.obs['CellID'] = adata.obs['CellID'].apply(
        lambda x: x[1:].replace('_', '-'))
    adata.obs.set_index(['CellID'], inplace=True)
    # Translate Seurat column names to scanpy conventions.
    adata.obs.rename(columns={
        'seurat_clusters': 'louvain',
        'nFeature_RNA': 'n_genes',
        'nCount_RNA': 'n_counts',
        'coor_x': 'x',
        'coor_y': 'y'
    },
                     inplace=True)
    adata.obsm['X_pca'] = adata.obsm.pop("pca_cell_embeddings")
    adata.obsm['X_umap'] = adata.obsm.pop("umap_cell_embeddings")
    adata.varm['PCs'] = adata.varm.pop("pca_feature_loadings")
    del adata.layers['scale_data']
    # BUG FIX: the coordinate columns were renamed to 'x'/'y' above, so
    # selecting 'coor_x'/'coor_y' here raised a KeyError.
    rawdata = anndata.AnnData(X=adata.X,
                              obs=adata.obs[['x', 'y']],
                              var=adata.var[['Selected']])
    adata.raw = rawdata
    # Keep raw counts as a layer, promote normalised data to X.
    adata.layers['raw_data'] = adata.X
    adata.X = adata.layers['norm_data']
    del adata.layers['norm_data']
    adata.write(h5adFile)
Esempio n. 3
0
def read_adata(
        gex_data, # filename
        gex_data_type # string describing file type
):
    ''' Split this out so that other code can use it. Read GEX data

    gex_data_type must be one of 'h5ad', '10x_mtx', '10x_h5' or 'loom';
    anything else aborts the program (raises SystemExit).
    '''
    import sys

    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad( gex_data )

    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx( gex_data )

    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5( gex_data, gex_only=True )

    elif gex_data_type == 'loom':
        adata = sc.read_loom( gex_data )

    else:
        # FIX: bare exit() is only guaranteed in interactive sessions and
        # exits with status 0; sys.exit(msg) prints to stderr and returns
        # a non-zero status on this error path.
        sys.exit(f"unrecognized gex_data_type: {gex_data_type} "
                 "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")

    # a view cannot be mutated downstream, so materialize a copy
    if adata.isview: # this is so weird
        adata = adata.copy()
    return adata
Esempio n. 4
0
def import_scv_loom(loom):
    """Read a velocyto/scVelo loom file and report its basic structure.

    The 'spliced' layer is used as the main matrix; the obs/var keys and
    the matrix shape are echoed to stdout for a quick sanity check.
    """
    print('Importing loom file...')
    vdata = sc.read_loom(loom, sparse=True, X_name='spliced')
    for label, keys in (("Column attributes:", vdata.obs_keys()),
                        ("Row attributes:", vdata.var_keys())):
        print(label, keys)
    print('Shape: ', vdata.shape)
    return vdata
Esempio n. 5
0
def read_velocyto_loom(fn, args, **kw):
    """Load a velocyto loom file keyed by Ensembl accession.

    Renames the 'Gene' var column to 'gene_symbols', tags every cell
    with a sample id derived from the file name, and suffixes each
    barcode with that id so cells from different samples stay distinct.
    """
    adata = sc.read_loom(fn, var_names='Accession')
    adata.var.rename(columns={'Gene': 'gene_symbols'}, inplace=True)
    stem, _ext = os.path.splitext(os.path.basename(fn))
    adata.obs['sample_id'] = stem
    adata.obs['sample_id'] = adata.obs['sample_id'].astype('category')
    sv.utils.clean_obs_names(adata)
    adata.obs_names = ['-'.join((name, stem)) for name in adata.obs_names]
    adata.var.index.name = 'gene_ids'
    return adata
Esempio n. 6
0
def _read_obj(input_obj, input_format='anndata', **kwargs):
    if input_format == 'anndata':
        adata = sc.read(input_obj, **kwargs)
    elif input_format == 'loom':
        adata = sc.read_loom(input_obj, **kwargs)
    else:
        raise NotImplementedError(
            'Unsupported input format: {}'.format(input_format))
    adata.var = _fix_booleans(adata.var)
    adata.obs = _fix_booleans(adata.obs)

    return adata
Esempio n. 7
0
 def _load(self):
     """Dispatch to the reader matching ``self.input_format``.

     Returns the loaded AnnData object, or ``None`` when the format is
     not recognised — callers must handle the ``None`` case.
     """
     if self.input_format == 'h5ad':
         return sc.read_h5ad(self.input_filename)
     elif self.input_format == 'loom':
         return sc.read_loom(self.input_filename)
     elif self.input_format == '10x':
         return self._load_10x()
     elif self.input_format == 'mtx' or self.input_format == 'mex':
         # 'mtx' and 'mex' are treated as synonyms for Matrix Market input
         return sc.read_mtx(self.input_filename)
     elif self.input_format == 'bustools-count':
         return self._load_bustools_count()
     return None
Esempio n. 8
0
def fitting_gamma(adata, path):
    """Fit RNA-velocity gammas for the cells in *adata*.

    Subsets the loom at ``path['path']`` to the cells of *adata*, writes
    a temporary loom to ``path['loom']``, then runs the velocyto
    pipeline: gene filtering, size normalisation, PCA, kNN imputation,
    gamma fitting and velocity extrapolation.  Returns the fitted
    ``VelocytoLoom`` object.
    """
    bdata = scp.read_loom(path["path"], sparse=True, X_name='spliced')
    # keep only the cells that survived upstream QC, copy cluster labels
    bdata = bdata[adata.obs.index, ]
    bdata.obs['leiden'] = adata.obs['leiden']
    bdata.write_loom(path['loom'])
    vlm = vcy.VelocytoLoom(path["loom"])
    # gene filtering
    vlm.score_detection_levels(min_cells_express=20, min_expr_counts=50)
    vlm.filter_genes(by_detection_levels=True)
    # second pass filters on the unspliced (U) counts only
    vlm.score_detection_levels(min_expr_counts=0,
                               min_cells_express=0,
                               min_expr_counts_U=25,
                               min_cells_express_U=20)
    vlm.filter_genes(by_detection_levels=True)
    # print(vlm.S.shape)

    # size normalization
    vlm._normalize_S(relative_size=vlm.initial_cell_size,
                     target_size=np.median(vlm.initial_cell_size))
    vlm._normalize_U(relative_size=vlm.initial_Ucell_size,
                     target_size=np.median(vlm.initial_Ucell_size))

    # PCA
    vlm.perform_PCA()
    pcn = vlm.S.shape[0]  # NOTE(review): unused — candidate for removal
    # plt.plot(np.cumsum(vlm.pca.explained_variance_ratio_)[:100])
    # elbow: first PC where the explained-variance curve flattens
    n_comps = np.where(
        np.diff(np.diff(np.cumsum(vlm.pca.explained_variance_ratio_)) > 0.002)
    )[0][0]
    plt.axvline(n_comps, c="k")
    print("n_comps", n_comps)

    # KNN smoothing: neighbourhood size = 2.5% of the cells
    k = int(vlm.S.shape[1] * 0.025)
    print("k", k)
    vlm.knn_imputation(n_pca_dims=n_comps,
                       k=k,
                       balanced=True,
                       b_sight=k * 8,
                       b_maxl=k * 4,
                       n_jobs=16)

    # fit gamma
    vlm.fit_gammas()

    # estimate velocity
    vlm.predict_U()
    vlm.calculate_velocity()
    vlm.calculate_shift(assumption="constant_velocity")
    vlm.extrapolate_cell_at_t(delta_t=1., )
    print("velocity Done!")
    return vlm
Esempio n. 9
0
def loadData(SJout, loom_data, gene_counts, changeo_db):
    """Load all inputs for the downstream analysis.

    Parameters
    ----------
    SJout : str
        STAR splice-junction output with IGH junctions.
    loom_data : str
        Loom file read into its own AnnData object.
    gene_counts : str
        h5ad file with the gene-count matrix.
    changeo_db : str
        Change-O database of assembled heavy chains.

    Returns
    -------
    tuple
        (ab_tx, switch_tx, adata, loom_adata, changeo_db_H)
    """
    ab_tx, switch_tx = loadSJoutIGH(SJout)
    ###
    # Assemblies (Changeo)
    changeo_db_H = loadChangeoDbH(changeo_db)

    ###
    # Gene counts
    ###
    print("loading anndata")
    # (removed a dead `adata = 'placeholder'` assignment that was
    # immediately overwritten below)
    loom_adata = sc.read_loom(loom_data)
    adata = sc.read_h5ad(gene_counts)
    return ab_tx, switch_tx, adata, loom_adata, changeo_db_H
Esempio n. 10
0
def getdata(path):
    """Read a ``.loom`` file into an AnnData object.

    Parameters
    ----------
    path : str
        Absolute path to the input ``.loom`` file; obs/var names are
        taken from the 'obs_names' / 'var_names' loom attributes.

    Returns
    -------
    adata : AnnData
        The loaded scanpy object.
    """
    return sc.read_loom(path, var_names='var_names', obs_names='obs_names')
Esempio n. 11
0
    def _load_loom(self, markers):
        """Read expression data from a loom file and attach marker columns.

        ``self.args.indir`` is either the loom file itself or a directory
        containing ``data.loom``.  All layers are dropped after loading,
        and each column of *markers* is stored under ``uns['marker_<col>']``.
        """
        indir = self.args.indir
        loom_file = indir if indir.endswith(".loom") else os.path.join(indir, "data.loom")
        if not os.path.isfile(loom_file):
            raise ScelVisException("cannot find loom file at %s" % loom_file)
        logger.info("Reading data from %s", loom_file)
        # raw mode uses the spliced counts, otherwise the normalised layer
        x_name = "spliced" if self.args.use_raw else "norm_data"
        ad = sc.read_loom(loom_file, X_name=x_name)
        for layer in list(ad.layers.keys()):
            logger.info("Removing unused layer %s" % layer)
            del ad.layers[layer]
        for col in markers.columns:
            ad.uns["marker_" + col] = markers[col].values

        return ad
Esempio n. 12
0
    def renameV(self, loom_file):
        """Load velocyto output and interactively clean up cell barcodes.

        Reads ``loom_file`` into ``self.adataV`` (var names from the
        'Gene' attribute), displays the obs table, then prompts the user
        for a phrase to remove from the barcodes and a replacement.
        Finally all 'x' characters are stripped from the barcodes.

        Parameters
        ----------
        loom_file
            Path to the ``.loom`` file produced by velocyto.
        """

        self.adataV = sc.read_loom(loom_file, var_names='Gene')

        display(self.adataV.obs)
        remove = input('Phase to remove in cell barcodes: ')
        replace = input(
            'Phrase to replace the removed phrase of cell barcodes: ')
        self.adataV.obs.index = self.adataV.obs.index.str.replace(
            remove, replace)
        # NOTE(review): this removes EVERY 'x' in each barcode, not only a
        # trailing one — confirm that is intended.
        self.adataV.obs.index = self.adataV.obs.index.str.replace('x', '')
Esempio n. 13
0
def initialFiltering(args):
    """Basic QC filtering of a loom dataset ahead of pySCENIC.

    Reads ``args.loom_input``, applies cell/gene filters and
    mito/count cuts, draws QC plots, then writes a filtered loom file
    (``args.loom_filtered``) and an AnnData file (``args.anndata``).
    """

    sc.logging.print_versions()

    ##################################################
    # read unfiltered data from a loom file
    adata = sc.read_loom(args.loom_input)

    ##################################################
    # basic filtering / stats

    nCountsPerGene = np.sum(adata.X, axis=0)
    nCellsPerGene = np.sum(adata.X > 0, axis=0)

    # Show info
    print("Number of counts (in the dataset units) per gene:",
          nCountsPerGene.min(), " - ", nCountsPerGene.max())
    print("Number of cells in which each gene is detected:",
          nCellsPerGene.min(), " - ", nCellsPerGene.max())

    nCells = adata.X.shape[0]

    # pySCENIC thresholds (reported only; the actual cuts use args below)
    minCountsPerGene = 3 * .01 * nCells  # 3 counts in 1% of cells
    print("minCountsPerGene: ", minCountsPerGene)

    minSamples = .01 * nCells  # 1% of cells
    print("minSamples: ", minSamples)

    ####################
    # initial cuts
    sc.pp.filter_cells(adata, min_genes=args.thr_min_genes)
    sc.pp.filter_genes(adata, min_cells=args.thr_min_cells)

    ####################
    # mito and genes/counts cuts
    mito_genes = adata.var_names.str.startswith('MT-')
    # for each cell compute fraction of counts in mito genes vs. all genes
    if (sum(mito_genes) == 0):
        # no mito genes at all (e.g. non-human prefix) — avoid 0/0
        adata.obs['percent_mito'] = 0.0
    else:
        adata.obs['percent_mito'] = np.ravel(
            np.sum(np.asmatrix(adata[:, mito_genes].X), axis=1)) / np.ravel(
                np.sum(adata.X, axis=1))
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))

    ####################
    # plotting:
    sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
                 jitter=0.4,
                 multi_panel=True)

    sc.pl.scatter(adata, x='n_counts', y='percent_mito')
    sc.pl.scatter(adata, x='n_counts', y='n_genes')

    # hard cuts on gene count and mito fraction
    adata = adata[adata.obs['n_genes'] < args.thr_n_genes, :]
    adata = adata[adata.obs['percent_mito'] < args.thr_pct_mito, :]

    ##################################################
    # output to loom file (genes as rows, cells as columns):
    row_attrs = {
        "Gene": np.array(adata.var_names),
    }
    col_attrs = {
        "CellID": np.array(adata.obs_names),
        "nGene": np.array(np.sum(adata.X.transpose() > 0, axis=0)).flatten(),
        "nUMI": np.array(np.sum(adata.X.transpose(), axis=0)).flatten(),
    }

    lp.create(args.loom_filtered, adata.X.transpose(), row_attrs, col_attrs)
    adata.write(args.anndata)
Esempio n. 14
0
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config."
        )
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and args.make_var_index_unique:
        adata.var_names_make_unique()
        print("Making AnnData var index unique...")
    # Sort var index
    adata = adata[:, np.sort(adata.var.index)]
    adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))
elif INPUT_FORMAT == 'loom' and OUTPUT_FORMAT == 'h5ad':
    adata = sc.read_loom(FILE_PATH_IN, sparse=True, validate=False)
    adata = add_sample_id(adata=adata, args=args)
    # If is tag_cell_with_sample_id is given, add the sample ID as suffix
    if args.tag_cell_with_sample_id:
        adata = tag_cell(adata=adata,
                         tag=args.sample_id,
                         remove_10x_gem_well=args.remove_10x_gem_well)
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config."
        )
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and args.make_var_index_unique:
Esempio n. 15
0
    sc.pp.regress_out(adata, ['orig_ident'])
    #scale data
    sc.pp.scale(adata, max_value=10)
    #calculate PCA
    sc.tl.pca(adata, svd_solver='arpack', n_comps=50)
    return adata


# marker genes shown on the UMAP plots below
genes = [
    'Muc2', 'Aqp8', 'Krt20', 'Isg15', 'Reg3g', 'Chga', 'Top2a', 'Lgr5',
    'Smoc2', 'Ascl2', 'Cdk4', 'Rps15a'
]

cycling = sc.read_loom(
    "/Users/fr7/git_repos/single_cell/experiment_4/FINAL/integrated/control/cycling.loom",
    sparse=True)

# drop genes with zero counts, then run the shared pre-processing
sc.pp.filter_genes(cycling, min_counts=1)
cycling = pre_process(cycling)

sc.pl.pca_variance_ratio(cycling, log=True, n_pcs=50, save=".cycling.png")

sc.pp.neighbors(cycling, n_pcs=35, n_neighbors=50)

sc.tl.umap(cycling)
sc.pl.umap(cycling, color=genes, save=".cycling.png")

sc.tl.leiden(cycling, resolution=1)
sc.pl.umap(cycling,
           color=['leiden', 'ClusterName'],
Esempio n. 16
0
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Custom classes for classification
import classifiersV3 as cl

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages

# Load up loom file
print('\nLoading data...')
ccAF1_scanpy = sc.read_loom('data/cellcycle_int_integrated.loom')

# Parameters for CV
numSamples = 1000
nfolds = 100

# Initialize helper vars/indices for subsetting data (train/test)
nCells = ccAF1_scanpy.shape[0]
allInds = np.arange(0, nCells)

# Accumulators filled by the cross-validation loop (not shown here):
# per-fold train/test index sets, true labels, and ACTINN inputs/errors.
trainInds = []
truelab = []
testInds = []
mapMe_ACTINN = []
errorACTINN = []
Esempio n. 17
0
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config."
        )
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and args.make_var_index_unique:
        adata.var_names_make_unique()
        print("Making AnnData var index unique...")
    # Sort var index
    adata = adata[:, np.sort(adata.var.index)]
    adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))
elif INPUT_FORMAT == 'loom' and OUTPUT_FORMAT == 'h5ad':
    adata = sc.read_loom(FILE_PATH_IN, sparse=False)
    adata = add_sample_id(adata=adata, args=args)
    # If is tag_cell_with_sample_id is given, add the sample ID as suffix
    if args.tag_cell_with_sample_id:
        adata = tag_cell(adata=adata, tag=args.sample_id)
    adata.var.index = adata.var.index.astype(str)
    # Check if var index is unique
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and not args.make_var_index_unique:
        raise Exception(
            "VSN ERROR: AnnData var index is not unique. This can be fixed by making it unique. To do so update the following param 'makeVarIndexUnique = true' (under params.sc.sc_file_converter) in your config."
        )
    if len(np.unique(adata.var.index)) < len(
            adata.var.index) and args.make_var_index_unique:
        adata.var_names_make_unique()
        print("Making AnnData var index unique...")
Esempio n. 18
0
# write h5ad file
rna_adata.write(results_file)

#==========================
# use seurat for clustering (done externally in a notebook)
#==========================
'''
seurat.ipynb
'''

#==========================
#import clusters from seurat
#==========================

import scanpy as sc
seurat = sc.read_loom('seurate_cluster_RNA.loom', sparse=False, obs_names='CellID', var_names='Gene', dtype='float32')
seurat.obs['clusters']=seurat.obs['seurat_clusters']
# NOTE(review): this is an alias, not a copy — later edits to `seurat`
# also show up in `seurat_raw`; use seurat.copy() if a snapshot is wanted.
seurat_raw=seurat

# map gene ids to gene names (presumably Ensembl id -> symbol), dedupe
seurat.var.index=[gene_info['gene_id_to_name'][gene] for gene in seurat.var.index.values]
seurat.var_names_make_unique()

sc.tl.rank_genes_groups(seurat, 'clusters', method='wilcoxon')
sc.pl.rank_genes_groups(seurat, n_genes=25, sharey=False)

# plot heatmap of specific genes: top 21 marker names per cluster, flattened
specific_genes=pd.DataFrame(seurat.uns['rank_genes_groups']['names']).loc[0:20,:].T.values.flatten()
X=pd.DataFrame(seurat[:,specific_genes].layers['norm_data']).T
X.index=specific_genes
X.columns=seurat.obs['clusters']
from scale_plot_feature import plot_heatmap
Esempio n. 19
0
    sc.pp.regress_out(adata, ['orig_ident'])
    #scale data
    sc.pp.scale(adata, max_value=10)
    #calculate PCA
    sc.tl.pca(adata, svd_solver='arpack', n_comps=100)
    return adata


# marker genes shown on the UMAP plots below
genes = [
    'Muc2', 'Aqp8', 'Krt20', 'Isg15', 'Reg3g', 'Chga', 'Top2a', 'Lgr5',
    'Smoc2', 'Ascl2', 'Fer1l6'
]

control = sc.read_loom(
    "/Users/fr7/git_repos/single_cell/experiment_4/FINAL/merge/control.loom",
    sparse=True)

#pre-process
control = pre_process(control)

#plots to select number of components for clustering
sc.pl.pca_variance_ratio(control, log=True, n_pcs=100)

#compute the neighbourhood graphs
sc.pp.neighbors(control, n_pcs=35,
                n_neighbors=4)  #might want to play with n_neighbours

#plot UMAPs
sc.tl.umap(control)
sc.pl.umap(control, color=genes, save=".control.png")
Esempio n. 20
0
#!/data/mcgaugheyd/conda/envs/scVI/bin/python
# CLI: <loom> <n_epochs> <lr> <use_cuda:'True'> <n_hidden> <n_latent> <n_hvg>

import sys
import scarches as sca
import scanpy as sc
import pandas as pd

args = sys.argv

adata = sc.read_loom(args[1])
print(adata.X)

# hyper-parameters from the command line (positional)
n_epochs = int(args[2])
lr = float(args[3])
# NOTE(review): any value other than the literal string 'True' disables CUDA
if args[4] == 'True':
    use_cuda = True
else:
    use_cuda = False

n_hidden = int(args[5])
n_latent = int(args[6])
n_hvg = int(args[7])

adata.obs['batch'] = adata.obs['batch'].astype('category')

# echo the configuration for the log
print(str(n_epochs) + " epochs")
print(str(lr) + " learning rate")
print(str(n_hidden) + " hidden layers")
print(str(n_latent) + " latent dims")
print(str(n_hvg) + " HVG")
Esempio n. 21
0
# Map every HGNC alias symbol to its Ensembl id.
hgncAliasEnsmbl = {}
for i in hgncEnsmbl.loc[~hgncEnsmbl['Alias symbols'].isnull()].index:
    splitUp = hgncEnsmbl.loc[i, 'Alias symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncAliasEnsmbl[j] = ensmbl

# Load up loom file
for set1 in [
        'GSE70630', 'GSE89567', 'GSE854465_all', 'GSE131928_10X',
        'GSE131928_Smartseq2', 'Bhaduri', 'GSE139448', 'GSE102130'
]:
    if not exists('results/table1_' + str(set1) + '.csv'):
        print('\nLoading ' + set1 + ' data...')
        set1_scanpy = sc.read_loom('data/forClassification/gliomas/' + set1 +
                                   '.loom')
        set1_scanpy = set1_scanpy[:, [
            True if i in hgncEnsmbl.index or i in hgncPrevEnsmbl or i in
            hgncAliasEnsmbl else False for i in set1_scanpy.var_names
        ]]
        # Convert to ensmbl
        convertMe_hgnc = pd.DataFrame(np.nan,
                                      index=set1_scanpy.var_names,
                                      columns=['ensembl'])
        numConverted_hgnc = 0
        missed = []
        for j in set1_scanpy.var_names:
            if j in hgncEnsmbl.index:
                convertMe_hgnc.loc[j, 'ensembl'] = hgncEnsmbl.loc[
                    j, 'Ensembl ID(supplied by Ensembl)']
            elif j in hgncPrevEnsmbl:
Esempio n. 22
0
import sys
import tnn
from tnn.tnn import *
import scanpy as sc

args = sys.argv

adata = sc.read_loom(args[1], sparse = True)
dims = float(args[2])
# NOTE(review): `type` here is the BUILTIN, which never equals 'counts',
# so this normalisation branch is dead code — probably a missing variable
# (e.g. a data-kind CLI argument). Confirm and fix upstream.
if type == 'counts':
	sc.pp.normalize_total(adata, target_sum=1e4)
	sc.pp.log1p(adata)
	adata.raw = adata
# pca
sc.tl.pca(adata)

# fewer neighbours for the smaller 'onlyWE' dataset
if 'onlyWE' in args[1]:
	k = 10
else:
	k = 50 

# semi-supervised
adata_tnn = adata.copy()
tnn_model = TNN(k = k, 
        distance = 'pn', 
        batch_size = 64, 
        n_epochs_without_progress = 4, 
        epochs = 50, 
        embedding_dims=dims,
        approx = True)
#if 'onlyDROP' in args[1]:
Esempio n. 23
0
from scvi.models import SCANVI
from scvi.dataset.anndataset import AnnDatasetFromAnnData
from scvi.inference import UnsupervisedTrainer, JointSemiSupervisedTrainer, SemiSupervisedTrainer
import scanpy as sc
import numpy as np
import torch

# train/test loom files for the scANVI cell-type-transfer example
in_loom_train = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/paper.loom"
#in_loom_train = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/paper_.loom"
in_loom_test = "/Users/pgarcia-nietochanzuckerberg.com/projects/cell_type_transfer/pancreas/data/hca_cellTypes.loom"

#---------------------------
# Example of pre-processing

adata = sc.read_loom(in_loom_train)
adata_test = sc.read_loom(in_loom_test)

adata.var_names_make_unique()
adata_test.var_names_make_unique()


# PRE-PROCESS
# keep genes detected in at least 5% of training cells, then log1p both sets
sc.pp.filter_genes(adata, min_cells=int(0.05 * adata.shape[0]))
sc.pp.log1p(adata)
sc.pp.log1p(adata_test)
# Then find variable genes (on the training set only)
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat")
# Label test data
adata.obs["scanvi_test"] = False
adata_test.obs["scanvi_test"] = True
Esempio n. 24
0
    def load_genotypes(self, suffix:str, from_loom:bool = False):
        """Load single-cell genotyping data into a loom file.

        When ``from_loom`` is True, reads the previously created loom at
        ``<out_folder>/<suffix>_genotype.loom`` into ``self.genotypes``.
        Otherwise builds that loom from the raw hdf5 at
        ``self.path['genotypes_path']``: the GT matrix (missing calls
        encoded as 3) becomes the main layer, with GQ ('quality'),
        DP ('total_depth') and AD ('alt_depth') added as extra layers.
        Cells without antibody data are dropped (their genotypes are kept
        on ``self.genotypes_noAb``).
        """
        # load genotyping data from hdf5 compressed file 
        self.filename = os.path.join(self.path['out_folder'], suffix + '_genotype.loom')
        
        if from_loom:
            try:
                self.genotypes = sc.read_loom(self.filename)
            except ValueError:
                print('loom file not found, check paths or try to load from raw files')
            return
        with h5py.File(self.path['genotypes_path'], 'r') as f:
            
            # import hdf5 layers into arrays  
            cell_barcodes = copy.deepcopy([c.decode('utf8') for c in f['CELL_BARCODES']])
            variants = copy.deepcopy([v.decode('utf8') for v in f['VARIANTS']])
            
            # cells with no abs, no genotype
            no_abs = list(set(cell_barcodes) - set(np.array(self.pat_cell_barcodes_ab[0])))
            
            genotypes = pd.DataFrame(np.transpose(f['GT']), index=cell_barcodes, columns=variants).sort_index()
            genotypes.index.name = 'cell_barcode'
            # sample name derived from the barcode suffix after the last '-'
            sample_name = ['abseq' + c.split('-')[-1] for c in genotypes.index]
            genotypes['sample_name'] = sample_name
            genotypes.set_index([genotypes.index, 'sample_name'], inplace=True)
            self.genotypes_noAb = genotypes.loc[no_abs] # may have to create loom file for this too, won't be loaded if starting from loom
            genotypes = genotypes.drop(index=no_abs)
            # 3 encodes a missing genotype call
            genotypes[genotypes.isnull()] = 3
            
            #adata = ad.AnnData(np.array(genotypes), dtype='int')
            #adata.obs['cell_barcode'] = genotypes.index
            #adata.var['variant_name'] = genotypes.columns
            #adata.filename = os.path.join(self.path['out_folder'], self.filename)
            loompy.create(self.filename ,np.array(genotypes), {'cell_barcode':np.array(genotypes.index)}, {'sample_name':np.array(genotypes.columns)})
            del genotypes

            # same pattern for each extra layer: build frame, drop ab-less
            # cells, append to the loom, free the memory
            quality = pd.DataFrame(np.transpose(f['GQ']), index=cell_barcodes, columns=variants).sort_index()
            quality.index.name = 'cell_barcode'
            quality = quality.drop(index=no_abs)
            with loompy.connect(self.filename) as ds:
                ds.layers['quality'] = np.array(quality).astype(int)
            #adata.layers['quality'] = np.array(total_depth).astype(int)
            del quality
            
            total_depth = pd.DataFrame(np.transpose(f['DP']), index=cell_barcodes, columns=variants).sort_index()
            total_depth.index.name = 'cell_barcode'
            total_depth = total_depth.drop(index=no_abs)
            with loompy.connect(self.filename) as ds:
                ds.layers['total_depth'] = np.array(total_depth).astype(int)
            #adata.layers['total_depth'] = np.array(total_depth).astype(int)
            del total_depth
            
            alt_depth = pd.DataFrame(np.transpose(f['AD']), index=cell_barcodes, columns=variants).sort_index()
            alt_depth.index.name = 'cell_barcode'
            alt_depth = alt_depth.drop(index=no_abs)
            with loompy.connect(self.filename) as ds:
                ds.layers['alt_depth'] = np.array(alt_depth).astype(int)
            #adata.layers['alt_depth'] = np.array(alt_depth).astype(int)
            del alt_depth
            
            #adata.write(os.path.join(self.path['out_folder'], self.filename))
            with loompy.connect(self.filename) as ds:
                self.harmonize_Abs(ds)
            #self.genotypes = adata
            # calculate vaf - nan for division by 0
            #vaf = np.divide(alt_depth, total_depth)
        return
Esempio n. 25
0
    splitUp = hgncEnsmbl.loc[i, 'Previous symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncPrevEnsmbl[j] = ensmbl

# Map every HGNC alias symbol to its Ensembl id.
hgncAliasEnsmbl = {}
for i in hgncEnsmbl.loc[~hgncEnsmbl['Alias symbols'].isnull()].index:
    splitUp = hgncEnsmbl.loc[i, 'Alias symbols'].split(', ')
    ensmbl = hgncEnsmbl.loc[i, 'Ensembl ID(supplied by Ensembl)']
    for j in splitUp:
        hgncAliasEnsmbl[j] = ensmbl

## 3. Load up data
# Load up loom file
print('\nLoading WT data...')
adata1 = sc.read_loom('data/WT.loom')

## 4. Convert to Ensembl gene IDs
# Convert from gene symbols to Ensmbl gene IDs
adata1 = adata1[:, [
    True if i in hgncEnsmbl.index or i in hgncPrevEnsmbl or i in
    hgncAliasEnsmbl else False for i in adata1.var_names
]]
convertMe_hgnc = pd.DataFrame(np.nan,
                              index=adata1.var_names,
                              columns=['ensembl'])
numConverted_hgnc = 0
missed = []
for j in adata1.var_names:
    if j in hgncEnsmbl.index:
        convertMe_hgnc.loc[j, 'ensembl'] = hgncEnsmbl.loc[
Esempio n. 26
0
def HDS(
        path1=None,
        clusters=None,
        genes=None,
        per=.1,
        # pv=0.025, co=.9, r_c=0,
        min_genes=200,
        min_cells=3,
        n_genes_by_counts=2500,
        pct_counts_mt=5,
        resolution=1):
    """Run the velocity gamma-fit scoring pipeline on a loom file.

    Parameters
    ----------
    path1 : str
        Path to the input ``.loom`` file (spliced counts as X).
    clusters : sequence, optional
        Precomputed leiden labels; when omitted the data is clustered here.
    genes : sequence, optional
        Genes to plot; defaults to the first five rows of the r2 table.
    per : float
        Fraction forwarded to ``violin_plotting``.
    min_genes, min_cells, n_genes_by_counts, pct_counts_mt, resolution
        QC/clustering parameters forwarded to ``clustering``.

    Returns
    -------
    r2
        Per-gene r2 table computed leiden-wise; intermediate files and
        plots are written under ``./loom_data/``.
    """
    # path variables
    path = {}
    path['path'] = path1
    root = os.getcwd() + '/' + 'loom_data'
    if not os.path.exists(os.getcwd() + '/' + 'loom_data'):
        os.mkdir(root)
    adata = scp.read_loom(path['path'], sparse=False, X_name='spliced')
    # cluster de novo unless labels were supplied
    if not clusters:
        adata = clustering(adata, min_genes, min_cells, n_genes_by_counts,
                           pct_counts_mt, resolution)
    else:
        adata.obs['leiden'] = clusters
    path["loom"] = root + "/temp.loom"
    path["metadata"] = root + "/metadata.csv"
    path["sigi"] = root + "/significant_restoration_genes_across_all_cells.csv"
    path["sigd"] = root + "/significant_hb_genes_across_all_cells.csv"
    path['r2'] = root + "/r2_all_cells.csv"
    path[
        "velocity_age_0p025"] = root + "/Velocity_0p025_age_specific_gamma.csv"
    # plots save location
    path['save'] = root + "/"
    # fitting gamma
    vlm = fitting_gamma(adata, path)
    # compute r-squared leiden wise
    leiden = np.unique(vlm.ca['leiden'])  # NOTE(review): unused
    r2 = r2_feature_wise(path, vlm, feature='leiden', m="r2")
    # mis = r2_feature_wise(path, vlm, feature='leiden',m="mis")
    violin_plotting(path, r2, per=per)
    # getting_cor(r2,pv=0.025,co=.9)
    # r2_cut(path, r_c, r2)
    # count_genes()
    # sigi = pd.read_csv(path["sigi"], index_col=0)
    # sigd = pd.read_csv(path["sigd"], index_col=0)
    if genes:
        # restrict the requested genes to those that have an r2 entry
        genes = np.intersect1d(genes, r2.index)
        if len(genes) > 0:
            plot_genes_r2_phase_portrait_supp_hb(path,
                                                 vlm,
                                                 r2,
                                                 genes,
                                                 figname=True)
        else:
            print("no common gene found!")
    else:
        genes = r2.index[:5]
        plot_genes_r2_phase_portrait_supp_hb(path,
                                             vlm,
                                             r2,
                                             genes,
                                             figname=True)
        """genes=sigi.index[:5]
      plot_genes_r2_phase_portrait_supp_hb(path, vlm, sigi, genes, figname=True)
      genes=sigd.index[:5]
      plot_genes_r2_phase_portrait_supp_hb(path, vlm, sigd, genes, figname=True)"""
    return r2
Esempio n. 27
0
#------------------------------------

import numpy as np
import matplotlib.pyplot as pl
from matplotlib.pyplot import plot, savefig
import scanpy as sc

# fig setting
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
results_file = './testnoOLCs.h5ad'
sc.settings.set_figure_params(dpi=300)

# loading the dataset
adata = sc.read_loom(
    'data/DataSetA.neurogenicSubset.seurat.pca10.umap.tsne.annotation.noOLCs.loom'
)

# pre-process, diffmap and trajectory inference
# root the pseudotime at the 11th cell annotated as 'qNSCs'
adata.uns['iroot'] = np.flatnonzero(adata.obs['ClusterName'] == 'qNSCs')[10]
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss')
sc.tl.diffmap(adata, n_comps=10)
sc.tl.dpt(adata, n_branchings=0, n_dcs=10)
#adata = read_h5ad('./test.h5ad')

# expose the imported embedding under the key scanpy's tsne plot expects
adata.obsm['tsne'] = adata.obsm['tsne_cell_embeddings']
fig, ax = pl.subplots(figsize=(12, 12))
axs = sc.pl.tsne(adata, color=['dpt_pseudotime', 'ClusterName'], show=False)
pl.show()
pl.savefig('./testnoOLCs.pdf')
adata.write(results_file)
import scvelo as scv
import scanpy as sc


# read adata object
adata = sc.read_loom('path_to_file')

# Normalize and calculate the moments
scv.pp.normalize_per_cell(adata)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

#Calculate the velocities
scv.tl.velocity(adata)


# Here you could first filter cells.
# So they dont get included in the velocity graph
scv.tl.velocity_graph(adata)

#Command in scvelo and scanpy both work on the same object
# so we can switch between the two modules

# calculate tsne, with optional arguments
sc.tl.tsne(adata)
scv.tl.tsne(adata,perplexity=50, early_exaggeration=100,learning_rate=100,random_state=108, n_jobs=12)

# write tsne to file.
scv.pl.tsne(adata, color='Stimulation', save='path_to_save_figure')
scv.pl.tsne(adata, color='Batch', save='path_to_save_figure')

#calculate the velocity embedding by project velocities on the tsne embedding
Esempio n. 29
0
#figure_file = open('figures%s.png' % i, 'w')

sc.settings.set_figure_params(dpi=80)

#adata = sc.read_loom('PC9Combined.loom', var_names = 'gene_symbols', sparse = True, cleanup = False, X_name = 'spliced', obs_names = 'CellID')

#day00a = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D0.1500.dge", var_names = 'gene_symbols', delimiter = "\t")
#day01 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D1.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t")
#day02 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D2.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t")
#day04 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D4.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t")
#day09 = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D9.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t")
#day11a = sc.read_text("/home/alex/lab/drugres/DropSeqFiles/D11.txt.500.dge", var_names = 'gene_symbols', delimiter = "\t")

# Load the per-day velocyto loom files; the 'spliced' layer becomes X
# and cells are keyed by the 'CellID' attribute.
day00 = sc.read_loom('Day0.loom',
                     var_names='gene_symbols',
                     sparse=True,
                     cleanup=False,
                     X_name='spliced',
                     obs_names='CellID')
day01 = sc.read_loom('Day01.loom',
                     var_names='gene_symbols',
                     sparse=True,
                     cleanup=False,
                     X_name='spliced',
                     obs_names='CellID')
day02 = sc.read_loom('Day02.loom',
                     var_names='gene_symbols',
                     sparse=True,
                     cleanup=False,
                     X_name='spliced',
                     obs_names='CellID')
day04 = sc.read_loom('Day04.loom',
##############################################################
## Definition of Auxilliary functions

## Subsetting adata by top HVGs
def subset_hvg(adata, n_top):
    """Normalize, log-transform and subset *adata* to its top HVGs.

    Mutates *adata* in place (total-count normalisation to 10k, log1p,
    HVG flags) and returns a view restricted to the ``n_top`` highly
    variable genes.
    """
    sc.pp.normalize_total(adata, target_sum=10000)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=n_top)
    hvg_mask = adata.var['highly_variable']
    return adata[:, hvg_mask]


#############################################################

## Reading reference and query datasets (paths supplied by Snakemake)
annData = sc.read_loom(snakemake.input.reference_loom)
target_annData = sc.read_loom(snakemake.input.query_loom)


#############################################################
##							   ##
## 	Harmonizing reference and query datasets	   ##
##							   ##
#############################################################  

## Keep only the cell-type and batch annotation columns on the reference
annData.obs = annData.obs[
                 [snakemake.params.ref_cell_type_column, 
                 snakemake.params.ref_batch_column]
]