Ejemplo n.º 1
0
 def __init__(self,
              dataname,
              save_path='/data/muris_tabula/',
              tissue='Marrow'):
     """Set up the Tabula Muris dataset.

     Records the configuration on the instance, runs ``self.download()``
     followed by ``self.preprocess()``, casts the counts to ``'int'`` and
     wraps them in ``csr_matrix``, then hands everything to the parent
     constructor with upper-cased gene names.

     :param dataname: stored as ``self.dataname``.
     :param save_path: stored as ``self.save_path``.
     :param tissue: stored as ``self.tissue``.
     """
     self.save_path = save_path
     self.dataname = dataname
     self.tissue = tissue

     # The two metadata tables and the two count matrices are hosted in
     # different places; file names are shared with ``download_names``.
     metadata_files = ['TM_droplet_metadata.csv', 'TM_facs_metadata.csv']
     matrix_files = ['TM_droplet_mat.h5ad', 'TM_facs_mat.h5ad']
     github_root = 'https://github.com/czbiohub/tabula-muris-vignettes/raw/master/data/'
     s3_root = 'https://s3.amazonaws.com/czbiohub-tabula-muris/'
     self.urls = ([github_root + name for name in metadata_files] +
                  [s3_root + name for name in matrix_files])
     self.download_names = metadata_files + matrix_files

     self.download()
     raw_count, labels, cell_type, gene_names = self.preprocess()
     int_count = csr_matrix(raw_count.astype('int'))
     matrix_attributes = GeneExpressionDataset.get_attributes_from_matrix(
         int_count, labels=labels)
     super(TabulaMuris, self).__init__(
         *matrix_attributes,
         gene_names=np.char.upper(gene_names),
         cell_types=cell_type)
Ejemplo n.º 2
0
 def __init__(self, save_path='../Paul/'):
     """Build the Paul dataset from the output of ``self.preprocess()``.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     """
     self.save_path = save_path
     preprocessed = self.preprocess()
     count, labels, cell_type, gene_names = preprocessed
     matrix_attributes = GeneExpressionDataset.get_attributes_from_matrix(
         count, labels=labels)
     # Gene names are upper-cased before being passed to the parent class.
     upper_gene_names = np.char.upper(gene_names)
     super(Paul, self).__init__(*matrix_attributes,
                                gene_names=upper_gene_names,
                                cell_types=cell_type)
Ejemplo n.º 3
0
 def __init__(self, save_path='/data/yosef2/scratch/chenling/scanvi_data/'):
     """Build the DentateGyrus10X dataset from ``self.preprocess()``.

     Labels are cast to ``'int'`` and gene names are upper-cased before
     being forwarded to the parent constructor.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     """
     self.save_path = save_path
     preprocessed = self.preprocess()
     count, raw_labels, cell_type, gene_names = preprocessed
     int_labels = raw_labels.astype('int')
     matrix_attributes = GeneExpressionDataset.get_attributes_from_matrix(
         count, labels=int_labels)
     super(DentateGyrus10X, self).__init__(
         *matrix_attributes,
         gene_names=np.char.upper(gene_names),
         cell_types=cell_type)
Ejemplo n.º 4
0
 def __init__(self, save_path='../Tusi/'):
     """Build the Tusi dataset from ``self.preprocess()``.

     After the parent constructor has run, ``batch_indices`` is replaced
     by the preprocessed batch ids reshaped to a single column, and the
     ``time_traj`` / ``diff_axis`` attributes are attached.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     """
     self.save_path = save_path
     preprocessed = self.preprocess()
     (count, labels, cell_type, gene_names,
      time, diff_axis, batchid) = preprocessed
     matrix_attributes = GeneExpressionDataset.get_attributes_from_matrix(
         count, labels=labels)
     super(Tusi, self).__init__(*matrix_attributes,
                                gene_names=np.char.upper(gene_names),
                                cell_types=cell_type)
     # Assigned after the parent constructor so these values are the ones
     # that remain on the instance.
     n_cells = len(batchid)
     self.batch_indices = batchid.reshape(n_cells, 1)
     self.time_traj = time
     self.diff_axis = diff_axis
Ejemplo n.º 5
0
 def __init__(self, save_path='/data/yosef2/scratch/chenling/scanvi_data/', coarse=True):
     """Build the ZeiselMoleArchData dataset from ``self.preprocess()``.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     :param coarse: when truthy, every label is replaced by
         ``labels_groups[label]`` and ``groups`` is used as the cell-type
         list. When falsy, the original labels are kept and
         ``labels_groups`` / ``groups`` are exposed as attributes instead.
     """
     self.save_path = save_path
     (count, labels, cell_type, gene_names,
      labels_groups, groups) = self.preprocess()
     labels = labels.astype('int')
     # Truthiness instead of ``== True`` / ``== False``: with the equality
     # form a non-bool flag (e.g. ``None``) matched neither branch.
     if coarse:
         labels = labels_groups[labels]
         cell_type = groups
     super(ZeiselMoleArchData, self).__init__(
         *GeneExpressionDataset.get_attributes_from_matrix(
             count, labels=labels),
         gene_names=np.char.upper(gene_names), cell_types=cell_type)
     if not coarse:
         self.labels_groups = labels_groups
         self.groups = groups
Ejemplo n.º 6
0
 def __init__(self, save_path='/data/yosef2/scratch/chenling/scanvi_data/cortex1/', coarse=True):
     """Build the ZeiselCortexOnly dataset from ``self.preprocess()``.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     :param coarse: when truthy, every label is replaced by
         ``labels_groups[label]`` and ``groups`` is used as the cell-type
         list. When falsy, the original labels are kept and
         ``labels_groups`` / ``groups`` are exposed as attributes instead.

     After the parent constructor has run, ``batch_indices`` is set to the
     inverse indices of ``np.unique(batch)`` as a single column and
     ``batch_names`` to the unique batch values.
     """
     self.save_path = save_path
     (count, labels, cell_type, gene_names,
      labels_groups, groups, batch) = self.preprocess()
     labels = labels.astype('int')
     batch_names, batch_indices = np.unique(batch, return_inverse=True)
     # Truthiness instead of ``== True`` / ``== False``: with the equality
     # form a non-bool flag (e.g. ``None``) matched neither branch.
     if coarse:
         labels = labels_groups[labels]
         cell_type = groups
     super(ZeiselCortexOnly, self).__init__(
         *GeneExpressionDataset.get_attributes_from_matrix(
             count, labels=labels),
         gene_names=np.char.upper(gene_names), cell_types=cell_type)
     if not coarse:
         self.labels_groups = labels_groups
         self.groups = groups
     # Assigned after the parent constructor so these values are the ones
     # that remain on the instance.
     self.batch_indices = batch_indices.reshape(len(batch_indices), 1)
     self.batch_names = batch_names
Ejemplo n.º 7
0
 def __init__(self, save_path='../AIBS/', coarse=True):
     """Build the Zeng10X dataset from ``self.preprocess()``.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     :param coarse: when truthy, each distinct label is mapped to the
         entry of ``groups[labels_groups]`` at the same position in
         ``np.unique(labels)``; labels are then re-encoded as indices into
         the sorted unique group names, which also become the cell types.

     ``labels_groups`` and ``groups`` are always stored on the instance;
     in coarse mode ``groups`` holds the re-derived unique group names.
     """
     self.save_path = save_path
     (count, gene_names, labels, cell_type,
      labels_groups, groups) = self.preprocess()
     assert len(labels_groups) == len(cell_type)
     labels = labels.astype('int')
     # Truthiness instead of ``== True`` so that any truthy flag selects
     # the coarse labelling.
     if coarse:
         new_labels_dict = dict(
             zip(np.unique(labels), groups[labels_groups]))
         labels = np.asarray([new_labels_dict[x] for x in labels])
         groups, labels = np.unique(labels, return_inverse=True)
         cell_type = groups
     super(Zeng10X, self).__init__(
         *GeneExpressionDataset.get_attributes_from_matrix(count,
                                                           labels=labels),
         gene_names=np.char.upper(gene_names),
         cell_types=cell_type)
     self.labels_groups = labels_groups
     self.groups = groups
Ejemplo n.º 8
0
 def __init__(self, save_path='../AIBS/', coarse=True):
     """Build the MacoskoDataset from ``self.preprocess()``.

     :param save_path: stored as ``self.save_path`` before preprocessing.
     :param coarse: when truthy, fine cell types are collapsed into the
         hard-coded group list below by prefix match (``str.startswith``)
         on the upper-cased group names; groups with no cell are dropped
         and the remaining ones are renumbered consecutively.

     ``self.labels_groups`` is always set: in coarse mode it is the list
     of group indices recomputed here, otherwise the value returned by
     ``preprocess``.
     """
     self.save_path = save_path
     count, labels, cell_type, gene_names, labels_groups = self.preprocess()
     labels = labels.astype('int')
     # Truthiness instead of ``== True`` so that any truthy flag selects
     # the coarse labelling.
     if coarse:
         groups = [
             'Pvalb', 'L2/3', 'Sst', 'L5 PT', 'L5 IT Tcap', 'L5 IT Aldh1a7',
             'L5 IT Foxp2', 'L5 NP', 'L6 IT', 'L6 CT', 'L6 NP', 'L6b',
             'Lamp5', 'Vip', 'Astro', 'OPC', 'VLMC', 'Oligo', 'Sncg',
             'Endo', 'SMC', 'MICRO'
         ]
         groups = np.asarray([x.upper() for x in groups])
         # Cell types reordered so that members of a group are contiguous.
         cell_type_bygroup = np.concatenate(
             [[x for x in cell_type if x.startswith(y)] for y in groups])
         new_labels_dict = dict(
             zip(cell_type_bygroup, np.arange(len(cell_type_bygroup))))
         # Integer labels -> cell-type names -> index in the reordered list.
         labels = np.asarray([cell_type[x] for x in labels])
         new_labels = np.asarray([new_labels_dict[x] for x in labels])
         # For each reordered cell type, index of the first matching group.
         labels_groups = [[
             i for i, x in enumerate(groups) if y.startswith(x)
         ][0] for y in cell_type_bygroup]
         coarse_labels_dict = dict(
             zip(np.arange(len(labels_groups)), labels_groups))
         coarse_labels = np.asarray(
             [coarse_labels_dict[x] for x in new_labels]).astype('int')
         # Computed once; used both to select the groups that actually
         # occur and to renumber them 0..k-1.
         present_groups = np.unique(coarse_labels)
         groups = groups[present_groups]
         mapping = dict(
             zip(present_groups, np.arange(len(present_groups))))
         coarse_labels = np.asarray([mapping[x] for x in coarse_labels])
         cell_type = groups
         labels = coarse_labels
     super(MacoskoDataset, self).__init__(
         *GeneExpressionDataset.get_attributes_from_matrix(count,
                                                           labels=labels),
         gene_names=np.char.upper(gene_names),
         cell_types=cell_type)
     self.labels_groups = labels_groups
Ejemplo n.º 9
0
def _int_param_or_default(name, default):
    """Return ``int(snakemake.params[name])``, or *default* on AttributeError."""
    try:
        return int(snakemake.params[name])
    except AttributeError:
        return default


COMPONENTS = _int_param_or_default('components', 10)  # Latent component count
LAYERS = _int_param_or_default('layers', 1)  # number of hidden layers

RECONSTRUCTION_LOSS = "nb"

# Batch ids as a single int64 column; counts as an int64 array.
batch = batch.reshape((-1, 1)).astype('int64')
cvals = counts.values.astype('int64')
zz = GeneExpressionDataset.get_attributes_from_matrix(
    cvals, batch_indices=batch)

# Original author's notes on the returned tuple (not verified here):
# zz[0]: int64, ndarray, genes x cells
# zz[1]: float32, ndarray, cells x 1
# zz[2]: float32, ndarray, cells x 1
# zz[3]: int64, ndarray, cells x 1

dataset = GeneExpressionDataset(*zz, gene_names=counts.columns)

# Training configuration.
n_epochs = 400
lr = 1e-3
use_batches = True
use_cuda = torch.cuda.is_available()

torch.set_num_threads(20)
# However, need to set MKL_NUM_THREADS too
Ejemplo n.º 10
0
def _build_evf_dataset(batch_counts, batch_labels):
    """Wrap one batch of simulated counts/labels in a GeneExpressionDataset."""
    attributes = GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(batch_counts), labels=batch_labels)
    gene_names = ['gene' + str(i) for i in range(2000)]
    cell_types = ['type' + str(i + 1) for i in range(5)]
    return GeneExpressionDataset(*attributes,
                                 gene_names=gene_names,
                                 cell_types=cell_types)


model_type = str(sys.argv[1])
plotname = 'simulation.EVF'

# The stored count matrix is transposed before rows are selected with
# masks built from the rows of ``meta``.
count = np.load('../sim_data/Sim_EVFbatch.UMI.npy').T
meta = np.load('../sim_data/Sim_EVFbatch.meta.npy')

# Column 2 of ``meta`` selects the batch (0 or 1); column 1 holds the label.
in_first_batch = meta[:, 2] == 0
in_second_batch = meta[:, 2] == 1

count_1 = count[in_first_batch, :]
labels_1 = meta[in_first_batch, 1]

count_2 = count[in_second_batch, :]
labels_2 = meta[in_second_batch, 1]

dataset1 = _build_evf_dataset(count_1, labels_1)
dataset2 = _build_evf_dataset(count_2, labels_2)

gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)

if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    if model_type == 'vae':
        vae = VAE(gene_dataset.nb_genes,
                  n_batch=gene_dataset.n_batches,
Ejemplo n.º 11
0
from scvi.dataset.dataset import GeneExpressionDataset
from scvi.harmonization.benchmark import knn_purity_avg
from scvi.inference import *
from scvi.models.scanvi import SCANVI
from scvi.models.vae import VAE

def _build_simulated_dataset(sim_counts, sim_labels):
    """Wrap simulated counts/labels in a GeneExpressionDataset."""
    attributes = GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(sim_counts), labels=sim_labels)
    gene_names = ['gene'+str(i) for i in range(2000)]
    cell_types = ['type'+str(i+1) for i in range(5)]
    return GeneExpressionDataset(*attributes,
                                 gene_names=gene_names,
                                 cell_types=cell_types)


model_type = 'svaec'  # str(sys.argv[1])
plotname = 'EVFbatch_simulation'

# Count matrices are transposed on load; label arrays are used as stored.
countUMI = np.load('../sim_data/count.UMI.npy').T
countnonUMI = np.load('../sim_data/count.nonUMI.npy').T
labelUMI = np.load('../sim_data/label.UMI.npy')
labelnonUMI = np.load('../sim_data/label.nonUMI.npy')

UMI = _build_simulated_dataset(countUMI, labelUMI)

nonUMI = _build_simulated_dataset(countnonUMI, labelnonUMI)

if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    gene_dataset = GeneExpressionDataset.concat_datasets(UMI, nonUMI)

    if model_type == 'vae':
        vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches, n_labels=gene_dataset.n_labels,
                  n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
        infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
        infer_vae.train(n_epochs=250)
Ejemplo n.º 12
0
 def preprocess(self):
     """Load the Regev 10X nuclei data, from cache when available.

     If ``regev_data.svmlight`` exists under ``self.save_path``, the counts
     and labels are read from it and the cell types, gene names and
     label-to-group indices from the sibling ``.npy`` files.

     Otherwise the eight batches ``a``..``h`` are read from
     ``10X_nuclei_Regev/``, matched against the cluster membership table,
     concatenated, relabelled so that cell types belonging to the same
     hard-coded group are contiguous, written to the cache files above and
     returned.

     :return: tuple ``(count, labels, cell_type, gene_names,
         labels_groups)``.
     """
     # Fast path: reuse the cached preprocessing output.
     if os.path.isfile(self.save_path + 'regev_data.svmlight'):
         count, labels = load_svmlight_file(self.save_path +
                                            'regev_data.svmlight')
         cell_type = np.load(self.save_path + 'regev_data.celltypes.npy')
         gene_names = np.load(self.save_path + 'regev_data.gene_names.npy')
         labels_groups = np.load(self.save_path +
                                 'regev_data.labels_groups.npy')
         return (count, labels, cell_type, gene_names, labels_groups)
     else:
         regev_batches = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
         # Membership table read as strings; row 0 is skipped below
         # (presumably a header -- confirm against the CSV).
         label = np.genfromtxt(self.save_path +
                               '10X_nuclei_Regev/cluster.membership.csv',
                               dtype='str',
                               delimiter=',')
         # First column looks like '"<barcode>-<batch>"': the part after
         # '-' is the batch number, the part before it the barcode.
         label_batch = np.asarray([
             str(int(int(x.split('-')[1].split('"')[0])))
             for x in label[1:, 0]
         ])
         label_barcode = np.asarray(
             [x.split('-')[0].split('"')[1] for x in label[1:, 0]])
         # Second column is the quoted cluster id.
         label_cluster = np.asarray([x.split('"')[1] for x in label[1:, 1]])
         # Annotation table: quoted cluster id -> quoted cluster name.
         label_map = np.genfromtxt(
             self.save_path + '10X_nuclei_Regev/cluster.annotation.csv',
             dtype='str',
             delimiter=',')
         label_map = dict(
             zip([x.split('"')[1] for x in label_map[:, 0]],
                 [x.split('"')[1] for x in label_map[:, 1]]))
         # Starts as an empty list and becomes a dataset after the first
         # batch; the ``len(...) > 0`` test below relies on that.
         regev_data = []
         for batch_i, batch in enumerate(regev_batches):
             # NOTE(review): get_matrix_from_h5 is defined elsewhere; the
             # transpose suggests it returns genes x cells -- confirm.
             geneid, cellid, count = get_matrix_from_h5(
                 self.save_path + '10X_nuclei_Regev/' + batch +
                 '1/filtered_gene_bc_matrices_h5.h5', 'mm10-1.2.0_premrna')
             count = count.T.tocsr()
             # Drop the '-<suffix>' part of each cell id to get the barcode.
             cellid = [id.split('-')[0] for id in cellid]
             # Barcode -> cluster id for the cells of this batch (batch
             # numbers in the membership table are 1-based).
             label_dict = dict(
                 zip(label_barcode[label_batch == str(batch_i + 1)],
                     label_cluster[label_batch == str(batch_i + 1)]))
             # NOTE(review): TryFindCells is defined elsewhere; presumably
             # it keeps only cells found in label_dict -- confirm.
             new_count, matched_label = TryFindCells(
                 label_dict, cellid, count)
             # Encode cluster ids as 0..k-1 in np.unique order, so that
             # ``cell_type`` below lines up with the integer labels.
             new_label = np.repeat(0, len(matched_label))
             for i, x in enumerate(np.unique(matched_label)):
                 new_label[matched_label == x] = i
             cell_type = [label_map[x] for x in np.unique(matched_label)]
             dataset = GeneExpressionDataset(
                 *GeneExpressionDataset.get_attributes_from_matrix(
                     new_count, labels=new_label),
                 gene_names=geneid,
                 cell_types=cell_type)
             # Progress output: matrix shape and number of labels per batch.
             print(dataset.X.shape, len(dataset.labels))
             if len(regev_data) > 0:
                 regev_data = GeneExpressionDataset.concat_datasets(
                     regev_data, dataset)
             else:
                 regev_data = dataset
         dataset = regev_data
         cell_type = dataset.cell_types
         groups = [
             'Pvalb', 'L2/3', 'Sst', 'L5 PT', 'L5 IT Tcap', 'L5 IT Aldh1a7',
             'L5 IT Foxp2', 'L5 NP', 'L6 IT', 'L6 CT', 'L6 NP', 'L6b',
             'Lamp5', 'Vip', 'Astro', 'OPC', 'VLMC', 'Oligo', 'Sncg',
             'Endo', 'SMC', 'MICRO'
         ]
         # Upper-case both sides so the prefix match is case-insensitive.
         cell_type = [x.upper() for x in cell_type]
         groups = [x.upper() for x in groups]
         # Integer labels -> cell-type names.
         labels = np.asarray(
             [cell_type[x] for x in np.concatenate(dataset.labels)])
         # Cell types reordered so that members of a group are contiguous;
         # a cell type matching no group prefix is dropped from this list.
         cell_type_bygroup = np.concatenate(
             [[x for x in cell_type if x.startswith(y)] for y in groups])
         new_labels_dict = dict(
             zip(cell_type_bygroup, np.arange(len(cell_type_bygroup))))
         # Raises KeyError if a cell's type matched no group above.
         new_labels = np.asarray([new_labels_dict[x] for x in labels])
         # For each reordered cell type, index of the first matching group.
         labels_groups = [[
             i for i, x in enumerate(groups) if y.startswith(x)
         ][0] for y in cell_type_bygroup]
         # Cache everything so the next call takes the fast path.
         dump_svmlight_file(dataset.X, new_labels,
                            self.save_path + 'regev_data.svmlight')
         np.save(self.save_path + 'regev_data.celltypes.npy',
                 cell_type_bygroup)
         np.save(self.save_path + 'regev_data.gene_names.npy',
                 dataset.gene_names)
         np.save(self.save_path + 'regev_data.labels_groups.npy',
                 labels_groups)
         return (dataset.X, new_labels, cell_type_bygroup,
                 dataset.gene_names, labels_groups)
Ejemplo n.º 13
0
# Eleven population names; ``labels_map`` below has one entry per name.
# NOTE(review): presumably ``pbmc_labels`` indexes into this list -- confirm.
temp = [
    "CD34+", "CD56+ NK", "CD4+/CD45RA+/CD25- Naive T", "CD4+/CD25 T Reg",
    "CD8+/CD45RA+ Naive Cytotoxic", "CD4+/CD45RO+ Memory", "CD8+ Cytotoxic T",
    "CD19+ B", "CD4+ T Helper2", "CD14+ Monocyte", "Dendritic"
]
# labels_map[i] is the index in ``cell_type`` that old label i is mapped to.
labels_map = [6, 1, 0, 0, 4, 0, 4, 3, 0, 2, 5]
cell_type = [
    "CD4+ T Helper2", "CD56+ NK", "CD14+ Monocyte", "CD19+ B",
    "CD8+ Cytotoxic T", "Dendritic", "CD34+"
]
# Masks are taken from the untouched ``pbmc_labels`` so that an already
# rewritten entry is never remapped a second time.
labels_new = deepcopy(pbmc_labels)
for old_label, merged_label in enumerate(labels_map):
    labels_new[pbmc_labels == old_label] = merged_label

pbmc_attributes = GeneExpressionDataset.get_attributes_from_matrix(
    pbmc.tocsr(), labels=labels_new)
dataset3 = GeneExpressionDataset(*pbmc_attributes,
                                 gene_names=genenames,
                                 cell_types=cell_type)

sub_dataset1 = sample_celltype(dataset1, subpop, prop)
# Number of cells carrying the label whose cell-type name equals ``subpop``.
subpop_cell_counts = [
    np.sum(sub_dataset1.labels == idx)
    for idx, name in enumerate(sub_dataset1.cell_types) if name == subpop
]
print('total number of cells =' + str(subpop_cell_counts[0]))
gene_dataset = GeneExpressionDataset.concat_datasets(
    sub_dataset1, dataset2, dataset3)
gene_dataset.subsample_genes(5000)

vae = VAE(gene_dataset.nb_genes,
          n_batch=gene_dataset.n_batches,
          n_labels=gene_dataset.n_labels,