Ejemplo n.º 1
0
def test_read_10x_h5():
    """Smoke-test reading 10x HDF5 matrices from two CellRanger versions."""
    cases = [
        ('1.2.0', 'filtered_gene_bc_matrices_h5.h5', 'hg19_chr21'),
        ('3.0.0', 'filtered_feature_bc_matrix.h5', 'GRCh38_chr21'),
    ]
    for version, matrix_file, genome in cases:
        sc.read_10x_h5(os.path.join(ROOT, version, matrix_file),
                       genome=genome)
Ejemplo n.º 2
0
def tsne_cluster(path, njobs):
    """Run the Zheng17 recipe, t-SNE and Louvain clustering on a 10x file.

    Parameters
    ----------
    path : str
        Path to a 10x ``.h5`` matrix file.
    njobs : int
        Number of parallel jobs used by t-SNE.

    Side effects: saves the t-SNE plot and writes ``one_million.h5ad``.
    """
    adata = sc.read_10x_h5(path)
    sc.pp.recipe_zheng17(adata)
    # BUG FIX: the original passed the undefined name ``n_jobs`` (the
    # parameter is spelled ``njobs``), raising NameError at runtime.
    sc.tl.tsne(adata, n_jobs=njobs)
    sc.tl.louvain(adata)
    sc.pl.tsne(adata, color='louvain_groups', save=True, right_margin=2)
    adata.write('one_million.h5ad')
Ejemplo n.º 3
0
    def read_raw_file(self):
        """Read ``self.raw_file`` (``.h5`` or ``.h5ad``) into a dense
        AnnData object stored on ``self.sc_raw``.

        Raises
        ------
        ValueError
            If the file extension is neither ``h5`` nor ``h5ad``.
        """
        print("reading single cell data from {}".format(self.raw_file))

        # Infer the format from the file extension.
        file_format = self.raw_file.split('.')[-1]

        readers = {'h5': sc.read_10x_h5, 'h5ad': sc.read}
        reader = readers.get(file_format)
        if reader is None:
            raise ValueError('Reading [ %s ] failed, the inferred file '
                             'format [ %s ] is not supported. Please convert '
                             'your file to either h5 or h5ad format.' %
                             (self.raw_file, file_format))
        andata = reader(self.raw_file)

        # appends -1 -2... to the name of genes that already exist
        andata.var_names_make_unique()
        # Densify so downstream code can rely on a plain ndarray.
        if sp_sparse.issparse(andata.X):
            andata.X = andata.X.toarray()

        self.sc_raw = andata
Ejemplo n.º 4
0
def convert_from_10xh5(path, genome):
    """Yield one dense DataFrame (cells x genes) read from a 10x .h5 file."""
    adata = sc.read_10x_h5(path, genome)
    adata.var_names_make_unique()
    dense = adata.X.todense()
    frame = pandas.DataFrame(dense,
                             index=adata.obs_names,
                             columns=adata.var_names)
    yield frame
Ejemplo n.º 5
0
def adataToTrainState(filename, refname, num_genes=8000):
    """Load a 10x ``.h5`` file and prepare it for training.

    Filters low-gene cells, annotates per-cell mito/ribo count fractions
    and total counts, then keeps at most ``num_genes`` dispersed genes.
    """
    adata = sc.read_10x_h5(filename, refname)
    # Sample name: third-from-last path component of the input file.
    adata.name = filename.split(os.sep)[-3]
    sc.tl.addCleanObsNames(adata)
    sc.pp.filter_cells(adata, min_genes=10)
    # Mitochondrially encoded genes, matched by bare symbol.
    mito_symbols = {
        'ND1', 'ND2', 'ND4L', 'ND4', 'ND5', 'ND6', 'ATP6', 'ATP8', 'CYTB',
        'COX1', 'COX2', 'COX3'
    }
    mito_genes = [name for name in adata.var_names if name in mito_symbols]
    # Ribosomal protein genes (small and large subunits).
    ribo_genes = [name for name in adata.var_names
                  if name.startswith(('RPS', 'RPL'))]
    # .A1 flattens the 1-column sparse sums into 1-D arrays.
    total_counts = np.sum(adata.X, axis=1).A1
    adata.obs['percent_mito'] = (
        np.sum(adata[:, mito_genes].X, axis=1).A1 / total_counts)
    adata.obs['percent_ribo'] = (
        np.sum(adata[:, ribo_genes].X, axis=1).A1 / total_counts)
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    # Cap the gene count by the number of genes actually expressed.
    scanpy.api.pp.filter_genes_dispersion(
        adata, n_top_genes=min(np.sum(np.sum(adata.X, axis=0) > 0), num_genes))
    return adata
Ejemplo n.º 6
0
def adataToTrainStateAggregate(fileList, refname, num_genes=3000):
    """Load several 10x ``.h5`` files, concatenate them into one AnnData
    object with per-cell QC annotations, and keep at most ``num_genes``
    dispersed genes.

    Parameters
    ----------
    fileList : list of str
        Paths to 10x ``.h5`` matrix files, one per sample.
    refname : str
        Genome/reference name passed to ``sc.read_10x_h5``.
    num_genes : int
        Upper bound on genes kept by the dispersion filter.

    Returns
    -------
    AnnData with cells ordered by decreasing total counts.
    """
    adatas = []
    batch_categories = []
    for filename in fileList:
        a = sc.read_10x_h5(filename, refname)
        sc.pp.filter_cells(a, min_genes=30)
        # Sample name: third-from-last path component (assumes a
        # <sample>/outs/<file>.h5 layout -- TODO confirm).
        a.name = filename.split(os.sep)[-3]
        a.obs['dataset'] = a.name
        batch_categories.append(a.name)
        adatas.append(a)
    # Merge all samples; the sample name becomes the batch category.
    adata = sc.AnnData.concatenate(*adatas, batch_categories=batch_categories)
    # Mitochondrially encoded genes, matched by bare symbol.
    mito_genes = [
        name for name in adata.var_names if name in [
            'ND1', 'ND2', 'ND4L', 'ND4', 'ND5', 'ND6', 'ATP6', 'ATP8', 'CYTB',
            'COX1', 'COX2', 'COX3'
        ]
    ]
    #mito_genes = [name for name in adata.var_names if name.startswith(('MTND','MTCO','MTATP','MTCYB','MTRNR','MTTR',))]
    # Ribosomal protein genes (small and large subunits).
    ribo_genes = [
        name for name in adata.var_names
        if name.startswith('RPS') or name.startswith('RPL')
    ]
    # Fraction of per-cell counts in mito / ribo genes
    # (.A1 flattens the 1-column sparse sums into 1-D arrays).
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X,
                                       axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.obs['percent_ribo'] = np.sum(adata[:, ribo_genes].X,
                                       axis=1).A1 / np.sum(adata.X, axis=1).A1
    # add the total counts per cell as observations-annotation to adata
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    # Cap the gene count by the number of genes actually expressed.
    scanpy.api.pp.filter_genes_dispersion(
        adata, n_top_genes=min(np.sum(np.sum(adata.X, axis=0) > 0), num_genes))
    # Cells sorted by decreasing total counts.
    return (adata[adata.obs.n_counts.argsort()[::-1], :])
Ejemplo n.º 7
0
def load_10x_1_3mil():
    """
    https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons
    """
    base_dir = '/data/martin/single_cell/10x_1.3mil_mice_brain/'
    h5_name = '1M_neurons_filtered_gene_bc_matrices_h5.h5'
    return sc.read_10x_h5(base_dir + h5_name)
Ejemplo n.º 8
0
def convert(
        input,
        output,
        chunk_size=16 * 1024 * 1024,
        genome=None,
        overwrite=False
):
    """Convert a single-cell matrix between on-disk formats.

    Supported: 10x HDF5 (.h5) / Loom -> Zarr or AnnData (.h5ad), and
    AnnData (.h5ad) -> chunked Zarr.

    Parameters
    ----------
    input, output : str
        Source and destination paths; the extensions select the conversion.
    chunk_size : int
        Target chunk size in bytes for the .h5ad -> .zarr path.
    genome : str, optional
        Genome group to read from a 10x .h5 file; auto-detected when the
        file has exactly one top-level group.
    overwrite : bool
        Passed to zarr when creating the destination group.

    Raises
    ------
    Exception
        On unrecognized extensions or an ambiguous genome.
    """
    input_path, input_ext = splitext(input)
    output_path, output_ext = splitext(output)

    print('converting: %s to %s' % (input, output))

    if input_ext == '.h5' or input_ext == '.loom':
        if output_ext == '.zarr':
            # Convert 10x (HDF5) to Zarr. Open read-only and close the
            # handle deterministically (the original leaked it).
            with h5py.File(input, 'r') as source:
                zarr.tree(source)

                store = zarr.DirectoryStore(output)
                dest = zarr.group(store=store, overwrite=overwrite)

                # following fails if without_attrs=False (the default), possibly related to https://github.com/h5py/h5py/issues/973
                zarr.copy_all(source, dest, log=sys.stdout, without_attrs=True)
                zarr.tree(dest)
        elif output_ext == '.h5ad':
            if not genome:
                with h5py.File(input, 'r') as f:
                    keys = list(f.keys())
                if len(keys) == 1:
                    genome = keys[0]
                else:
                    raise Exception(
                        'Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s'
                        % (input, ','.join(keys))
                    )
            adata = read_10x_h5(input, genome=genome)

            # TODO: respect overwrite flag
            adata.write(output)
        else:
            # BUG FIX: an unsupported output extension was previously a
            # silent no-op for .h5/.loom inputs.
            raise Exception('Unrecognized output extension: %s' % output_ext)

    elif input_ext == '.h5ad':
        adata = read_h5ad(input, backed='r')
        (r, c) = adata.shape
        # BUG FIX: use floor division; under Python 3 '/' yields floats,
        # producing fractional chunk counts and per-chunk row counts.
        chunks = (getsize(input) - 1) // chunk_size + 1
        chunk_size = (r - 1) // chunks + 1
        if output_ext == '.zarr':
            print('converting %s (%dx%d) to %s in %d chunks (%d rows each)' % (input, r, c, output, chunks, chunk_size))

            # TODO: respect overwrite flag

            adata.write_zarr(
                make_store(output),
                chunks=(chunk_size, c)
            )
        else:
            raise Exception('Unrecognized output extension: %s' % output_ext)
    else:
        raise Exception('Unrecognized input extension: %s' % input_ext)
Ejemplo n.º 9
0
def basic_analysis(filename):
    """Subsample, preprocess and cluster a 10x dataset (RAPIDS neighbors),
    writing results and figures to disk."""
    cluster_key = 'louvain'
    adata = sc.read_10x_h5(filename)
    # Work on a 10% subsample of the cells.
    sc.pp.subsample(adata, fraction=0.1)
    sc.pp.recipe_zheng17(adata)
    sc.pp.neighbors(adata, method='rapids')
    sc.tl.louvain(adata)
    sc.tl.umap(adata)
    sc.tl.rank_genes_groups(adata, cluster_key)
    adata.write('./write/result_130K_gpu_neighbors.h5ad')
    # Save figures instead of showing them interactively.
    sc.pl.umap(adata, color=cluster_key, save='_130K_gpu_neighbors.png')
    sc.pl.rank_genes_groups(adata, save='_130K_gpu_neighbors.pdf')
def basic_analysis(filename):
    """Subsample, preprocess and cluster a 10x dataset (RAPIDS neighbors),
    writing results and figures to disk."""
    cluster_key = "louvain"
    adata = sc.read_10x_h5(filename)
    # Work on a 10% subsample of the cells.
    sc.pp.subsample(adata, fraction=0.1)
    sc.pp.recipe_zheng17(adata)
    sc.pp.neighbors(adata, method="rapids")
    sc.tl.louvain(adata)
    sc.tl.umap(adata)
    sc.tl.rank_genes_groups(adata, cluster_key)
    adata.write("./write/result_130K_gpu_neighbors.h5ad")
    # Save figures instead of showing them interactively.
    sc.pl.umap(adata, color=cluster_key, save="_130K_gpu_neighbors.png")
    sc.pl.rank_genes_groups(adata, save="_130K_gpu_neighbors.pdf")
Ejemplo n.º 11
0
def basic_analysis(filename):
    """Preprocess, cluster and embed a 10x dataset (Louvain + PAGA + UMAP),
    writing the result and plots to disk."""
    cluster_key = "louvain"
    adata = sc.read_10x_h5(filename)
    sc.pp.recipe_zheng17(adata)
    sc.pp.neighbors(adata)
    sc.tl.louvain(adata)
    sc.tl.paga(adata)
    sc.tl.umap(adata)
    sc.tl.rank_genes_groups(adata, cluster_key)
    adata.write("./write/result.h5ad")
    # Plots: PAGA graph, UMAP colored by cluster, marker-gene panels.
    sc.pl.paga(adata)
    sc.pl.umap(adata, color=cluster_key)
    sc.pl.rank_genes_groups(adata, save=".pdf")
Ejemplo n.º 12
0
 def get_genes(self, sce):
     """Resolve each SCE gene symbol to the matching identifier in the
     10x AnnData var index, trying ``.``/``-`` separator variants before
     falling back to the bare prefix.

     Parameters
     ----------
     sce
         Object exposing ``rowData["hgnc_symbol"]``.

     Returns
     -------
     list of resolved gene symbols, in input order.
     """
     _transcripts = sce.rowData["hgnc_symbol"]
     try:
         adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
     except Exception:
         # Fall back to the matrix-market directory output.
         adata = sc.read_10x_mtx(self.filtered_matrices())
     transcripts = []
     # BUG FIX: the original iterated over the (empty) ``transcripts``
     # list, so the loop never ran and the untouched input was returned.
     for symbol in _transcripts:
         if symbol not in adata.var.index:
             symbol = symbol.replace(".", "-")
             if symbol not in adata.var.index:
                 parts = symbol.split("-")
                 symbol = "-".join(parts[:-1]) + ".{}".format(parts[-1])
                 if symbol not in adata.var.index:
                     symbol = symbol.split(".")[0]
         transcripts.append(symbol)
     return transcripts
Ejemplo n.º 13
0
def read_anndata(input, genome=None):
    """Read a .h5 / .h5ad / .loom file into an AnnData object.

    Parameters
    ----------
    input : str
        Path whose extension selects the reader.
    genome : str, optional
        Genome group for 10x .h5 files; auto-detected when the file has
        exactly one top-level group.

    Raises
    ------
    Exception
        On an unrecognized extension or an ambiguous genome.
    """
    _, input_ext = splitext(input)
    if input_ext == ".h5":
        if not genome:
            # BUG FIX: close the HDF5 handle promptly instead of leaking
            # an open file descriptor.
            with File(input, "r") as h5:
                keys = list(h5.keys())
            if len(keys) == 1:
                genome = keys[0]
            else:
                raise Exception(
                    "Set --genome flag when converting from 10x HDF5 (.h5) to Anndata HDF5 (.h5ad); top-level groups in file %s: %s"
                    % (input, ",".join(keys)))
        return read_10x_h5(input, genome=genome)
    elif input_ext == ".h5ad":
        return read_h5ad(input)
    elif input_ext == ".loom":
        # reads the whole dataset in memory!
        return read_loom(input)
    else:
        raise Exception("Unrecognized input extension: %s" % input_ext)
Ejemplo n.º 14
0
 def gene_map(self, sce, original=False):
     """Build a mapping between SCE gene symbols and the 10x AnnData var
     index, trying ``.``/``-`` separator variants before falling back to
     the bare prefix.

     Parameters
     ----------
     sce
         Object exposing ``rowData["Symbol"]``.
     original : bool
         If True, keys are the SCE symbols and values the resolved
         AnnData symbols; if False, the mapping is inverted.

     Returns
     -------
     dict mapping between symbol spellings (direction per ``original``).
     """
     _transcripts = sce.rowData["Symbol"]
     try:
         adata = sc.read_10x_h5(self.filtered_h5(), genome=config.build)
     except Exception:
         # Fall back to the matrix-market directory output.
         adata = sc.read_10x_mtx(self.filtered_matrices())
     transcripts = {}
     for symbol in _transcripts:
         # BUG FIX: the original rebound the *parameter* ``original`` to
         # the current symbol, destroying the direction flag -- the
         # "if original" branch then fired for every truthy symbol.
         source_symbol = symbol
         if symbol not in adata.var.index:
             symbol = symbol.replace(".", "-")
             if symbol not in adata.var.index:
                 parts = symbol.split("-")
                 symbol = "-".join(parts[:-1]) + ".{}".format(parts[-1])
                 if symbol not in adata.var.index:
                     symbol = symbol.split(".")[0]
         if original:
             transcripts[source_symbol] = symbol
         else:
             transcripts[symbol] = source_symbol
     return transcripts
Ejemplo n.º 15
0
#!/usr/bin/env python3

import scanpy.api as sc
# Read 10x dataset
adata = sc.read_10x_h5("neuron_10k_v3_filtered_feature_bc_matrix.h5")
# Make variable names (in this case the genes) unique
adata.var_names_make_unique()

# PCA on the raw, unprocessed matrix (for comparison with the run below).
sc.tl.pca(adata)
sc.pl.pca(adata)
# Drop genes never observed, then per-cell normalization; the pre-filter
# totals are stashed under 'n_counts_all'.
sc.pp.filter_genes(adata, min_counts=1)
sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')

# Select the 1000 most variable genes, CellRanger flavor
# (log=False because the matrix is not yet log-transformed).
filter_result = sc.pp.filter_genes_dispersion(adata.X,
                                              flavor='cell_ranger',
                                              n_top_genes=1000,
                                              log=False)

# Subset to the selected genes, renormalize, log-transform and scale.
adata = adata[:, filter_result.gene_subset]
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
sc.pp.scale(adata)
# PCA again, now on the processed matrix.
sc.tl.pca(adata, n_comps=50)
sc.pl.pca(adata)
Ejemplo n.º 16
0
def getAnnData_10x_h5(input_file, genome="GRCh38"):
    """Read a 10x HDF5 matrix into an AnnData object.

    Parameters
    ----------
    input_file : str
        Path to the 10x ``.h5`` file.
    genome : str, optional
        Genome group inside the HDF5 file. The default preserves the
        previously hard-coded "GRCh38", so existing callers are unaffected.

    Returns
    -------
    AnnData
    """
    return sc.read_10x_h5(input_file, genome)
Ejemplo n.º 17
0
# coding: utf-8

# In[1]:

# BUG FIX: ``glob`` is used below but was never imported.
import glob

import pandas as pd
import scanpy.api as sc

# In[ ]:

# Read the cord-blood matrix and save it as .h5ad.
ica_cord_blood = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/human/immune_census/ica_cord_blood_h5.h5'
    )[0], "GRCh38")
ica_cord_blood.write(
    '/projects/sysbio/users/cellAtlas/scanpyObjects/ica_cord_blood.h5ad')

# In[ ]:

# Same for the bone-marrow matrix.
ica_bone_marrow = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/human/immune_census/ica_bone_marrow_h5.h5'
    )[0], "GRCh38")
ica_bone_marrow.write(
    '/projects/sysbio/users/cellAtlas/scanpyObjects/ica_bone_marrow.h5ad')
Ejemplo n.º 18
0
import numpy as np
import pickle

import sys
sys.path.append('/gpfs01/berens/user/dkobak/FIt-SNE')
from fast_tsne import fast_tsne

# BUG FIX: pandas is used below (pd.read_csv) but was never imported.
import pandas as pd


# LOAD AND PREPROCESS THE DATA

import scanpy.api as sc
sc.settings.verbosity = 2

# Data file is from here
# https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons
adata = sc.read_10x_h5('big-data/10x/1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)

# PCA via full SVD: center, decompose, flip component signs so each
# right-singular vector's entries sum to a non-negative value, then keep
# the 50 components with the largest singular values.
X = np.copy(adata.X)
X = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X, full_matrices=False)
U[:, np.sum(V, axis=1) < 0] *= -1
X = np.dot(U, np.diag(s))
X = X[:, np.argsort(s)[::-1]][:, :50]
# BUG FIX: close the pickle file deterministically (was a leaked handle).
with open('big-pickles/10x-pca.pickle', 'wb') as fh:
    pickle.dump(X, fh)

# load cluster labels
# https://github.com/theislab/scanpy_usage/blob/master/170522_visualizing_one_million_cells/results/louvain.csv.gz
clusters = pd.read_csv('data/10x-1mln-scanpy-louvain.csv.gz', header=None).values[:,1].astype(int)

import matplotlib
matplotlib.use('agg') # plotting backend compatible with screen
import scanpy.api as sc
import pandas as pd
import numpy as np
import logging
import os
import velocyto as vcy

logging.basicConfig(level=logging.INFO)


logging.info("Start")

sc.settings.verbosity = 2
sc.settings.autosave = True # save figures, do not show them
sc.settings.set_figure_params(dpi=300) # set sufficiently high resolution for saving

# CellRanger output directory (expanduser is a no-op on an absolute path).
inputfile = os.path.expanduser('/ye/yelabstore2/mtschmitz/seq/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/outs')
# Run louvain clustering on true gene expression values
velocityFile = os.path.expanduser('/ye/yelabstore2/mtschmitz/seq/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom')

# NOTE(review): neither ``inputfile`` nor ``velocityFile`` is used below --
# the paths are hard-coded again in the calls; presumably leftovers.
adata = sc.read_10x_h5('/home/mt/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/outs/filtered_gene_bc_matrices_h5.h5','refdata-celranger-Pabe2-toplevel')
adata=sc.tl.rna_velocity(adata,os.path.expanduser('~/code/data/AlignedOrangutanOrganoid/Exonic/orangutanorganoid_Out/velocyto/orangutanorganoid_Out.loom'))
Ejemplo n.º 20
0
# BUG FIX: ``glob`` is used below but was never imported.
import glob
import collections

import h5py
import numpy as np
import pandas as pd
import scanpy.api as sc
import scipy.sparse as sp_sparse
import tables

# In[169]:

# pick one data set here, but run code through for both
# (the second assignment wins; swap/comment to process the other set)
pick_one_dataset = "ica_cord_blood"
pick_one_dataset = "ica_bone_marrow"

scanpy_object = sc.read_10x_h5(
    glob.glob(
        '/projects/sysbio/users/cellAtlas/data/primary/human/immune_census/' +
        pick_one_dataset + '_h5.h5')[0], "GRCh38")

print(scanpy_object.shape)

# # Explore the sample IDs

# In[170]:

# Donor ID is encoded in sample IDs: the obs index prefix before the first
# '_' looks like "Manton<donor>", so splitting on "Manton" keeps the donor.
donor_id = [
    y.split("Manton")[1]
    for y in [x.split("_")[0] for x in scanpy_object.obs.index]
]
#print(set(donor_id))
Ejemplo n.º 21
0
import numpy as np
import pickle

import sys
sys.path.append('/gpfs01/berens/user/dkobak/FIt-SNE')
from fast_tsne import fast_tsne


# LOAD AND PREPROCESS THE DATA

import scanpy.api as sc
sc.settings.verbosity = 2

adata = sc.read_10x_h5('1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)

# PCA via full SVD: center, decompose, flip component signs so each
# right-singular vector's entries sum to a non-negative value, then keep
# the 50 components with the largest singular values.
X = np.copy(adata.X)
X = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X, full_matrices=False)
U[:, np.sum(V,axis=1)<0] *= -1
X = np.dot(U, np.diag(s))
X = X[:, np.argsort(s)[::-1]][:,:50]
pickle.dump(X, open('pca-scanpy.pickle', 'wb'))


# CLUSTERING

# NOTE(review): the dataset is re-read from scratch here rather than
# reusing ``adata`` from above.
import scanpy.api as sc
sc.settings.verbosity = 2
adata = sc.read_10x_h5('1M_neurons_filtered_gene_bc_matrices_h5.h5') 
sc.pp.recipe_zheng17(adata) 
Ejemplo n.º 22
0
#!/Users/cmdb/miniconda3/bin/python

import scanpy.api as sc
import sys
import matplotlib

# Save figures to disk rather than opening interactive windows.
sc.settings.autoshow = False


# Load the 10x HDF5 matrix given on the command line and deduplicate
# gene names.
adata = sc.read_10x_h5(sys.argv[1])
adata.var_names_make_unique()

# PCA on the raw counts, saved for later comparison against a
# Zheng-preprocessed run.
sc.tl.pca(adata, svd_solver='auto')
sc.pl.pca(adata, save="PCA_adata.png", title="PCA Before Zheng Pre-processing")
Ejemplo n.º 23
0
  # NOTE(review): this fragment begins inside an if/elif chain whose first
  # branch header is cut off above; kept byte-identical.
  filename = "home/1M_neurons_matrix_subsampled_25K.h5"
elif len(sys.argv) > 1:
  # default output file
  filename = sys.argv[1]
  if len(sys.argv) == 3:
    verbose = int(sys.argv[2])


start = datetime.datetime.now()

sc.settings.verbosity = verbose             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

print("Reading, making names unique and filtering:")
now = datetime.datetime.now()
adata = sc.read_10x_h5(filename)

adata.var_names_make_unique()  # this is unnecessary if using 'gene_ids'

# Basic QC: keep cells with >=200 genes and genes seen in >=3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
print("Reading, making names unique and filtering Time:", datetime.datetime.now() - now)

print("Log Normalizing Data")
now = datetime.datetime.now()
# Normalize each cell to 1e3 counts, then log1p-transform.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e3)
sc.pp.log1p(adata)
print("Log Normalizing Data Time:", datetime.datetime.now() - now)

# Snapshot the normalized data in .raw before further processing.
adata.raw = adata
print("Finding Variable Features")
Ejemplo n.º 24
0
# NOTE(review): ``pd`` and ``sc`` are used below but not imported in this
# fragment -- presumably imported earlier in the original file; confirm.
from scio import concatenate, check_obs, check_var

DATASET = "vanderburg_01"
OBS_DATA = "tables/{}_samples.csv".format(DATASET)
OUTPUT_DIR = "./"

# Per-sample metadata table, one row per sample.
obs = pd.read_csv(OBS_DATA)

dataset_samples = obs["samples"].values

# Raw 10x matrices, one per sample. ``sample[1:]`` drops the first
# character of the sample ID -- presumably a prefix; TODO confirm.
filenames = [
    "data/cellranger/{}_GEX/outs/raw_feature_bc_matrix.h5".format(sample[1:])
    for sample in dataset_samples
]

adatas = [sc.read_10x_h5(filename, genome="GRCh38") for filename in filenames]

# Remove duplicated gene symbols and tag each AnnData with its sample ID.
adatas2 = []
for adata, sample in zip(adatas, dataset_samples):
    duplicated = adata.var_names.duplicated()
    print("Removing {} gene symbols because they are duplicated".format(
        sum(duplicated)))
    adata = adata[:, ~duplicated].copy()
    adata.obs['samples'] = sample
    adatas2.append(adata)

# Concatenate all samples and left-join the per-sample metadata onto .obs.
adata = concatenate(adatas2, merge_var_cols=["gene_ids"])
adata.obs = adata.obs.join(obs.set_index("samples"), on="samples", how="left")

adata.obs["dataset"] = DATASET