Ejemplo n.º 1
0
# Fragment of a harmonization script: wraps two count matrices into labelled
# datasets, merges them and, when model_type == 'vae', fits a VAE on the merge.
# NOTE(review): the call below reads cellid, geneid, labels_map, count,
# cell_type and seurat before the assignments that follow; presumably an
# earlier, not-visible part of the script bound them for the first dataset
# -- confirm.
dataset1 = assign_label(cellid, geneid, labels_map, count, cell_type, seurat)

# Second dataset: matrix and identifiers read from the 'cite' directory.
count, geneid, cellid = get_matrix_from_dir('cite')
# Transposed and converted to CSR (presumably so that rows are cells -- confirm).
count = count.T.tocsr()
seurat = np.genfromtxt('../cite/cite.seurat.labels',
                       dtype='str',
                       delimiter=',')
# Keep only the part of each cell id that precedes the first '-'.
cellid = np.asarray([x.split('-')[0] for x in cellid])
labels_map = [0, 0, 1, 2, 3, 4, 5, 6]
# Column 4 of the Seurat table with the first row skipped (presumably a header).
# NOTE(review): `labels` is not passed to assign_label below and is rebound in
# the 'vae' branch; it may be unused here -- confirm before removing.
labels = seurat[1:, 4]
cell_type = [
    "CD4+ T Helper2", "CD56+ NK", "CD14+ Monocyte", "CD19+ B",
    "CD8+ Cytotoxic T", "FCGR3A Monocyte", "na"
]
dataset2 = assign_label(cellid, geneid, labels_map, count, cell_type, seurat)
# Merge both datasets, then restrict the merge via subsample_genes(5000)
# (presumably keeps 5000 genes -- confirm against the dataset API).
gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)
gene_dataset.subsample_genes(5000)

if model_type == 'vae':
    # Model dimensions (genes, batches, labels) come from the merged dataset.
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128,
              n_latent=10,
              n_layers=2,
              dispersion='gene')
    infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
    infer_vae.train(n_epochs=250)
    # After training, read the latent representation through the
    # 'sequential' data loader.
    data_loader = infer_vae.data_loaders['sequential']
    latent, batch_indices, labels = get_latent(vae, data_loader)
    keys = gene_dataset.cell_types
Ejemplo n.º 2
0
                from scvi.models import SCANVI, VAE
                from umap import UMAP
                import scanpy as sc

                # TODO: import the datasets into SCVI objects (sigh!)
                # scVI wants raw counts, but who knows about those TabulaMurisSenis data
                # quick and dirty solution for now
                asubr_scvi = asubr.copy()
                asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
                ds_atlas = AnnDatasetFromAnnData(asubr_scvi)

                asub2_scvi = asub2.copy()
                asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)
                ds_new = AnnDatasetFromAnnData(asub2_scvi)

                all_dataset = GeneExpressionDataset()
                all_dataset.populate_from_datasets([ds_atlas, ds_new])

                ##############################################################
                t0 = time.time()
                print('Prepare some data structures')
                vae = VAE(
                    all_dataset.nb_genes,
                    n_batch=all_dataset.n_batches,
                    n_labels=all_dataset.n_labels,
                    n_hidden=128,
                    n_latent=30,
                    n_layers=2,
                    dispersion='gene',
                )
Ejemplo n.º 3
0
# Model hyper-parameters are read from the Snakemake rule parameters; the
# AttributeError handlers below supply the defaults.
# NOTE(review): if the `snakemake` global itself is missing, the lookup raises
# NameError, which is not caught here -- confirm the script is only ever run
# through Snakemake.
try:
    COMPONENTS = int(snakemake.params['components'])
except AttributeError:
    COMPONENTS = 10  # Latent component count

try:
    LAYERS = int(snakemake.params['layers'])
except AttributeError:
    LAYERS = 1  # number of hidden layers

RECONSTRUCTION_LOSS = "nb"

# `batch` and `counts` are defined outside this excerpt.  The batch vector is
# reshaped into a single int64 column and the count values are cast to int64
# before being handed to scVI.
batch = batch.reshape((-1, 1)).astype('int64')
cvals = counts.values.astype('int64')
zz = GeneExpressionDataset.get_attributes_from_matrix(cvals,
                                                      batch_indices=batch)

# zz[0]: int64, ndarray, genes x cells
# zz[1]: float32, ndarray, cells x 1
# zz[2]: float32, ndarray, cells x 1
# zz[3]: int64, ndarray, cells x 1
# NOTE(review): gene_names below is taken from counts.columns, which suggests
# cvals is cells x genes; the "genes x cells" remark for zz[0] may be stale
# -- confirm.

dataset = GeneExpressionDataset(*zz, gene_names=counts.columns)

# Training settings consumed further down the script (not visible here).
n_epochs = 400
lr = 1e-3
use_batches = True
use_cuda = torch.cuda.is_available()

torch.set_num_threads(20)
# However, need to set MKL_NUM_THREADS too
# Write columns 600-799 and 800-999 of the simulated matrix to their own CSVs.
for _start, _out_path in (
        (600, "./data/count_data_simulation_v1_batch3.csv"),
        (800, "./data/count_data_simulation_v1_batch4.csv"),
):
    simulation_full.iloc[:, _start:_start + 200].to_csv(_out_path, sep=",")


def _load_simulation_batch(csv_name):
    """Open one simulation batch CSV as a CsvDataset with new_n_genes=3000."""
    return CsvDataset(csv_name, new_n_genes=3000)


# NOTE(review): the files are written under ./data/ but opened by bare name;
# presumably CsvDataset resolves names against a default directory -- confirm.
simulation_batch_1 = _load_simulation_batch(
    "count_data_simulation_v1_batch1.csv")
simulation_batch_2 = _load_simulation_batch(
    "count_data_simulation_v1_batch2.csv")
simulation_batch_3 = _load_simulation_batch(
    "count_data_simulation_v1_batch3.csv")
simulation_batch_4 = _load_simulation_batch(
    "count_data_simulation_v1_batch4.csv")

# Merge the four batches into a single dataset.
simulation_data = GeneExpressionDataset.concat_datasets(
    simulation_batch_1,
    simulation_batch_2,
    simulation_batch_3,
    simulation_batch_4,
)

# Model dimensions (genes, batches, labels) are taken from the merged dataset.
simulation_vae = VAE(
    simulation_data.nb_genes,
    n_batch=simulation_data.n_batches,
    n_labels=simulation_data.n_labels,
    n_hidden=128,
    n_latent=30,
    n_layers=2,
    dispersion='gene',
)

simulation_trainer = UnsupervisedTrainer(
    simulation_vae, simulation_data, train_size=0.9)

simulation_trainer.train(n_epochs=100)
Ejemplo n.º 5
0
 for celltype2 in dataset2.cell_types[:6]:
     if celltype1 != celltype2:
         print(celltype1 + ' ' + celltype2)
         pbmc = deepcopy(dataset1)
         newCellType = [
             k for i, k in enumerate(dataset1.cell_types)
             if k not in [celltype1, 'Other']
         ]
         pbmc.filter_cell_types(newCellType)
         pbmc2 = deepcopy(dataset2)
         newCellType = [
             k for i, k in enumerate(dataset2.cell_types)
             if k not in [celltype2, 'Other']
         ]
         pbmc2.filter_cell_types(newCellType)
         gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, pbmc2)
         # _,_,_,_,_ = run_model('writedata', gene_dataset, pbmc, pbmc2,filename=plotname+'.'
         #                                                                       +celltype1.replace(' ','')+'.'
         #                                                                       +celltype2.replace(' ',''))
         rmCellTypes = '.' + celltype1.replace(
             ' ', '') + '.' + celltype2.replace(' ', '')
         latent1 = np.genfromtxt('../harmonization/Seurat_data/' +
                                 plotname + rmCellTypes.replace(' ', '') +
                                 '.1.CCA.txt')
         latent2 = np.genfromtxt('../harmonization/Seurat_data/' +
                                 plotname + rmCellTypes.replace(' ', '') +
                                 '.2.CCA.txt')
         latent, batch_indices, labels, keys, stats = run_model(
             'readSeurat',
             gene_dataset,
             pbmc,
Ejemplo n.º 6
0
# Set-up for the EVF simulation run: the model to use comes from argv[1].
model_type = str(sys.argv[1])
plotname = 'simulation.EVF'

# The count matrix is used transposed relative to how it is stored on disk.
count = np.load('../sim_data/Sim_EVFbatch.UMI.npy').T
meta = np.load('../sim_data/Sim_EVFbatch.meta.npy')

# Rows are split on the value (0 or 1) found in metadata column 2; metadata
# column 1 supplies the labels for the selected rows.
_rows_0 = meta[:, 2] == 0
count_1, labels_1 = count[_rows_0, :], meta[_rows_0, 1]

_rows_1 = meta[:, 2] == 1
count_2, labels_2 = count[_rows_1, :], meta[_rows_1, 1]

# Each half becomes its own dataset with 2000 generated gene names
# ('gene0'..'gene1999') and five cell types ('type1'..'type5').
_attributes_1 = GeneExpressionDataset.get_attributes_from_matrix(
    csr_matrix(count_1), labels=labels_1)
dataset1 = GeneExpressionDataset(
    *_attributes_1,
    gene_names=['gene' + str(gene_no) for gene_no in range(2000)],
    cell_types=['type' + str(type_no + 1) for type_no in range(5)],
)

_attributes_2 = GeneExpressionDataset.get_attributes_from_matrix(
    csr_matrix(count_2), labels=labels_2)
dataset2 = GeneExpressionDataset(
    *_attributes_2,
    gene_names=['gene' + str(gene_no) for gene_no in range(2000)],
    cell_types=['type' + str(type_no + 1) for type_no in range(5)],
)

gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)

if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    if model_type == 'vae':
        vae = VAE(gene_dataset.nb_genes,
                  n_batch=gene_dataset.n_batches,
Ejemplo n.º 7
0
from scvi.dataset.dataset import GeneExpressionDataset
from scvi.harmonization.benchmark import knn_purity_avg
from scvi.inference import *
from scvi.models.scanvi import SCANVI
from scvi.models.vae import VAE

# The model type is fixed here; the command-line variant is kept for reference.
model_type = 'svaec'  # str(sys.argv[1])
plotname = 'EVFbatch_simulation'

# Count matrices are used transposed relative to how they are stored on disk.
countUMI = np.load('../sim_data/count.UMI.npy').T
countnonUMI = np.load('../sim_data/count.nonUMI.npy').T
labelUMI = np.load('../sim_data/label.UMI.npy')
labelnonUMI = np.load('../sim_data/label.nonUMI.npy')


def _simulated_dataset(counts, labels):
    """Wrap a count matrix and its labels into a GeneExpressionDataset.

    Gene names are generated as 'gene0'..'gene1999' and cell types as
    'type1'..'type5'.
    """
    attributes = GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(counts), labels=labels)
    return GeneExpressionDataset(
        *attributes,
        gene_names=['gene' + str(gene_no) for gene_no in range(2000)],
        cell_types=['type' + str(type_no + 1) for type_no in range(5)],
    )


UMI = _simulated_dataset(countUMI, labelUMI)

nonUMI = _simulated_dataset(countnonUMI, labelnonUMI)

if model_type in ('vae', 'svaec', 'Seurat', 'Combat'):
    # These model types all work on the concatenation of both datasets.
    gene_dataset = GeneExpressionDataset.concat_datasets(UMI, nonUMI)

    if model_type == 'vae':
        # Model dimensions are taken from the merged dataset.
        vae = VAE(
            gene_dataset.nb_genes,
            n_batch=gene_dataset.n_batches,
            n_labels=gene_dataset.n_labels,
            n_hidden=128,
            n_latent=10,
            n_layers=2,
            dispersion='gene',
        )
        infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
        infer_vae.train(n_epochs=250)
Ejemplo n.º 8
0
 def preprocess(self):
     """Return the Regev 10X nuclei counts and labels, caching the result.

     If ``<save_path>regev_data.svmlight`` exists, the counts/labels and the
     three companion ``.npy`` files are loaded and returned unchanged.
     Otherwise the eight raw batches under ``<save_path>10X_nuclei_Regev/``
     are read, cells are matched to their cluster membership, the per-batch
     datasets are concatenated, cell types are re-indexed so that they are
     ordered by coarse group, and the result is written to the cache files
     before being returned.

     Returns:
         A 5-tuple ``(count, labels, cell_types, gene_names, labels_groups)``.
         In the freshly built case ``labels`` indexes into ``cell_types`` and
         ``labels_groups[k]`` is the position, in the group list, of the
         first group whose name is a prefix of ``cell_types[k]``.

     NOTE: the cached branch returns exactly what ``load_svmlight_file`` and
     ``np.load`` produce, so container types can differ from the freshly
     built branch (e.g. ``labels_groups`` is a list when freshly built).
     """
     cache_prefix = self.save_path + 'regev_data'
     if os.path.isfile(cache_prefix + '.svmlight'):
         count, labels = load_svmlight_file(cache_prefix + '.svmlight')
         cell_type = np.load(cache_prefix + '.celltypes.npy')
         gene_names = np.load(cache_prefix + '.gene_names.npy')
         labels_groups = np.load(cache_prefix + '.labels_groups.npy')
         return (count, labels, cell_type, gene_names, labels_groups)

     raw_dir = self.save_path + '10X_nuclei_Regev/'
     regev_batches = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

     # Cluster membership table; the first row is skipped (presumably a
     # header -- confirm).  Column 0 is parsed as '"<barcode>-<batch>"' and
     # column 1 as a double-quoted cluster id.
     membership = np.genfromtxt(raw_dir + 'cluster.membership.csv',
                                dtype='str',
                                delimiter=',')
     quoted_ids = membership[1:, 0]
     label_batch = np.asarray(
         [str(int(x.split('-')[1].split('"')[0])) for x in quoted_ids])
     label_barcode = np.asarray(
         [x.split('-')[0].split('"')[1] for x in quoted_ids])
     label_cluster = np.asarray(
         [x.split('"')[1] for x in membership[1:, 1]])

     # Cluster id -> annotation, both stripped of their surrounding quotes.
     annotation = np.genfromtxt(raw_dir + 'cluster.annotation.csv',
                                dtype='str',
                                delimiter=',')
     label_map = {
         key.split('"')[1]: value.split('"')[1]
         for key, value in zip(annotation[:, 0], annotation[:, 1])
     }

     # None (rather than an empty list) marks "nothing merged yet", so the
     # merge never depends on the length of a dataset object.
     merged = None
     for batch_i, batch in enumerate(regev_batches):
         geneid, cellid, count = get_matrix_from_h5(
             raw_dir + batch + '1/filtered_gene_bc_matrices_h5.h5',
             'mm10-1.2.0_premrna')
         count = count.T.tocsr()
         cellid = [barcode.split('-')[0] for barcode in cellid]

         # Batches are numbered from 1 in the membership table.
         in_batch = label_batch == str(batch_i + 1)
         label_dict = dict(
             zip(label_barcode[in_batch], label_cluster[in_batch]))
         new_count, matched_label = TryFindCells(label_dict, cellid, count)

         # Encode each matched cluster id as its rank among the sorted
         # unique ids of this batch.
         clusters, new_label = np.unique(np.asarray(matched_label),
                                         return_inverse=True)
         cell_type = [label_map[x] for x in clusters]
         dataset = GeneExpressionDataset(
             *GeneExpressionDataset.get_attributes_from_matrix(
                 new_count, labels=new_label),
             gene_names=geneid,
             cell_types=cell_type)
         print(dataset.X.shape, len(dataset.labels))

         if merged is None:
             merged = dataset
         else:
             merged = GeneExpressionDataset.concat_datasets(merged, dataset)

     dataset = merged
     groups = [
         'Pvalb', 'L2/3', 'Sst', 'L5 PT', 'L5 IT Tcap', 'L5 IT Aldh1a7',
         'L5 IT Foxp2', 'L5 NP', 'L6 IT', 'L6 CT', 'L6 NP', 'L6b',
         'Lamp5', 'Vip', 'Astro', 'OPC', 'VLMC', 'Oligo', 'Sncg',
         'Endo', 'SMC', 'MICRO'
     ]
     # Prefix matching below is done on upper-cased names on both sides.
     cell_type = [x.upper() for x in dataset.cell_types]
     groups = [x.upper() for x in groups]

     # Per-cell cell-type name, looked up through the merged label indices.
     labels = np.asarray(
         [cell_type[x] for x in np.concatenate(dataset.labels)])
     # Cell types reordered so that they follow the order of `groups`.
     cell_type_bygroup = np.concatenate(
         [[x for x in cell_type if x.startswith(y)] for y in groups])
     new_labels_dict = dict(
         zip(cell_type_bygroup, np.arange(len(cell_type_bygroup))))
     new_labels = np.asarray([new_labels_dict[x] for x in labels])
     # Index of the first group whose name prefixes each reordered cell type.
     labels_groups = [
         [i for i, x in enumerate(groups) if y.startswith(x)][0]
         for y in cell_type_bygroup
     ]

     # Persist everything so that later calls take the cached branch.
     dump_svmlight_file(dataset.X, new_labels, cache_prefix + '.svmlight')
     np.save(cache_prefix + '.celltypes.npy', cell_type_bygroup)
     np.save(cache_prefix + '.gene_names.npy', dataset.gene_names)
     np.save(cache_prefix + '.labels_groups.npy', labels_groups)
     return (dataset.X, new_labels, cell_type_bygroup,
             dataset.gene_names, labels_groups)
Ejemplo n.º 9
0
# Each input CSV is loaded as its own CsvDataset.  new_n_genes=False is passed
# everywhere (presumably disables gene subsampling -- confirm against the
# CsvDataset API).
CLL1_d0 = CsvDataset(filename='results/scvi/input_files/CLL1_d0.csv',
                     save_path='',
                     sep=',',
                     new_n_genes=False)
CLL5_d0 = CsvDataset(filename='results/scvi/input_files/CLL5_d0.csv',
                     save_path='',
                     sep=',',
                     new_n_genes=False)

## 10X healthy (5')
healthy_10x = CsvDataset(filename='results/scvi/input_files/healthy_10x.csv',
                         save_path='',
                         sep=',',
                         new_n_genes=False)

# Combine all samples into one dataset, one list entry per sample.
# Only the .X matrices are passed, so no other per-dataset attribute is
# forwarded by this call.  The Batch_*, LB6, RB* and CLL6_d0 / CLL8_d0
# datasets are defined outside this excerpt.
all_dataset = GeneExpressionDataset()
all_dataset.populate_from_per_batch_list(Xs=[
    Batch_2_baseline.X, Batch_2_6m.X, Batch_2_relapse.X, Batch_3_baseline.X,
    Batch_3_6m.X, Batch_3_relapse.X, Batch_4_baseline.X, Batch_4_12m.X,
    Batch_5_baseline.X, Batch_5_6m.X, Batch_5_12m.X, Batch_6_baseline.X,
    Batch_6_relapse.X, Batch_7_baseline.X, Batch_7_relapse.X, LB6.X, RB1.X,
    RB2.X, RB3.X, CLL6_d0.X, CLL8_d0.X, CLL1_d0.X, CLL5_d0.X, healthy_10x.X
])

## Train, save and fin
vae = VAE(all_dataset.nb_genes,
          n_batch=all_dataset.n_batches,
          n_labels=all_dataset.n_labels,
          n_hidden=128,
          n_latent=30,
          n_layers=2,
# Give the 1401 columns sequential names 'sample_1'..'sample_1401'.
LUAD_full.columns = [
    "sample_" + str(sample_no) for sample_no in range(1, 1402)
]

# write count data into desired format

# One CSV per batch, each covering a contiguous range of columns.
for _start, _stop, _out_path in (
        (0, 274, "./data/count_data_LUAD_v1_batch1.csv"),
        (274, 1176, "./data/count_data_LUAD_v1_batch2.csv"),
        (1176, 1401, "./data/count_data_LUAD_v1_batch3.csv"),
):
    LUAD_full.iloc[:, _start:_stop].to_csv(_out_path, sep=",")


def _load_luad_batch(csv_name):
    """Open one LUAD batch CSV as a CsvDataset with new_n_genes=2267."""
    return CsvDataset(csv_name, new_n_genes=2267)


# NOTE(review): the files are written under ./data/ but opened by bare name;
# presumably CsvDataset resolves names against a default directory -- confirm.
LUAD_batch_1 = _load_luad_batch("count_data_LUAD_v1_batch1.csv")
LUAD_batch_2 = _load_luad_batch("count_data_LUAD_v1_batch2.csv")
LUAD_batch_3 = _load_luad_batch("count_data_LUAD_v1_batch3.csv")

# Merge the three batches into a single dataset.
LUAD_data = GeneExpressionDataset.concat_datasets(
    LUAD_batch_1, LUAD_batch_2, LUAD_batch_3)

# Model dimensions (genes, batches, labels) are taken from the merged dataset.
LUAD_vae = VAE(
    LUAD_data.nb_genes,
    n_batch=LUAD_data.n_batches,
    n_labels=LUAD_data.n_labels,
    n_hidden=128,
    n_latent=30,
    n_layers=2,
    dispersion='gene',
)

LUAD_trainer = UnsupervisedTrainer(LUAD_vae, LUAD_data, train_size=0.9)

LUAD_trainer.train(n_epochs=100)

LUAD_full = LUAD_trainer.create_posterior(LUAD_trainer.model,
                                          LUAD_data,
Ejemplo n.º 11
0
    "CD34+", "CD56+ NK", "CD4+/CD45RA+/CD25- Naive T", "CD4+/CD25 T Reg",
    "CD8+/CD45RA+ Naive Cytotoxic", "CD4+/CD45RO+ Memory", "CD8+ Cytotoxic T",
    "CD19+ B", "CD4+ T Helper2", "CD14+ Monocyte", "Dendritic"
]
# labels_map[i] is the new label given to cells whose original label is i.
# The new labels range over 0..6, matching the seven entries of cell_type
# below (presumably they index into it -- confirm).
labels_map = [6, 1, 0, 0, 4, 0, 4, 3, 0, 2, 5]
cell_type = [
    "CD4+ T Helper2", "CD56+ NK", "CD14+ Monocyte", "CD19+ B",
    "CD8+ Cytotoxic T", "Dendritic", "CD34+"
]
# Remap on a copy; the masks are computed on the untouched pbmc_labels, so a
# value that was already remapped is never remapped a second time.
labels_new = deepcopy(pbmc_labels)
for i, j in enumerate(labels_map):
    labels_new[pbmc_labels == i] = j

# pbmc, pbmc_labels and genenames are defined outside this excerpt.
dataset3 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(pbmc.tocsr(),
                                                      labels=labels_new),
    gene_names=genenames,
    cell_types=cell_type)

sub_dataset1 = sample_celltype(dataset1, subpop, prop)
# Report how many cells of sub_dataset1 carry the label index whose cell type
# equals subpop.  The trailing [0] raises IndexError when subpop is not among
# sub_dataset1.cell_types.
print('total number of cells =' + str([
    np.sum(sub_dataset1.labels == i)
    for i, k in enumerate(sub_dataset1.cell_types) if k == subpop
][0]))
# Merge the subsampled dataset with the two others, then restrict the merge
# via subsample_genes(5000) (presumably keeps 5000 genes -- confirm).
gene_dataset = GeneExpressionDataset.concat_datasets(sub_dataset1, dataset2,
                                                     dataset3)
gene_dataset.subsample_genes(5000)

vae = VAE(gene_dataset.nb_genes,
          n_batch=gene_dataset.n_batches,
          n_labels=gene_dataset.n_labels,