# Fragment of a harmonization script; the statements that prepare the inputs
# of the first assign_label call are outside this view.
dataset1 = assign_label(cellid, geneid, labels_map, count, cell_type, seurat)

# Second dataset: read the 'cite' matrix, transpose it and store it as CSR.
count, geneid, cellid = get_matrix_from_dir('cite')
count = count.T.tocsr()
seurat = np.genfromtxt('../cite/cite.seurat.labels', dtype='str', delimiter=',')
# Keep only the part of each cell id that precedes the first '-'.
cellid = np.asarray([x.split('-')[0] for x in cellid])
labels_map = [0, 0, 1, 2, 3, 4, 5, 6]
# Column 4 of the Seurat table with the first (header) row skipped.
# NOTE(review): `labels` is not passed to assign_label below and is rebound by
# get_latent in the 'vae' branch - confirm it is still needed here.
labels = seurat[1:, 4]
cell_type = [
    "CD4+ T Helper2", "CD56+ NK", "CD14+ Monocyte", "CD19+ B",
    "CD8+ Cytotoxic T", "FCGR3A Monocyte", "na"
]
dataset2 = assign_label(cellid, geneid, labels_map, count, cell_type, seurat)

# Merge both datasets and restrict the merged data to 5000 genes.
gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)
gene_dataset.subsample_genes(5000)

if model_type == 'vae':
    vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels, n_hidden=128, n_latent=10,
              n_layers=2, dispersion='gene')
    # `use_cuda` is defined outside this fragment.
    infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
    infer_vae.train(n_epochs=250)
    # Extract the latent representation through the 'sequential' data loader.
    data_loader = infer_vae.data_loaders['sequential']
    latent, batch_indices, labels = get_latent(vae, data_loader)
    keys = gene_dataset.cell_types
from scvi.models import SCANVI, VAE
from umap import UMAP
import scanpy as sc

# TODO: import the datasets into SCVI objects (sigh!)
# scVI wants raw counts, but who knows about those TabulaMurisSenis data
# quick and dirty solution for now
# The stored values of each copy's X are cast to int64 before wrapping;
# `asubr` and `asub2` are defined outside this fragment.
asubr_scvi = asubr.copy()
asubr_scvi.X.data = asubr_scvi.X.data.astype(np.int64)
ds_atlas = AnnDatasetFromAnnData(asubr_scvi)

asub2_scvi = asub2.copy()
asub2_scvi.X.data = asub2_scvi.X.data.astype(np.int64)
ds_new = AnnDatasetFromAnnData(asub2_scvi)

# Combine the atlas and the new data into a single scVI dataset.
all_dataset = GeneExpressionDataset()
all_dataset.populate_from_datasets([ds_atlas, ds_new])

##############################################################
# Start time, presumably used for timing later in the script.
t0 = time.time()
print('Prepare some data structures')

# VAE sized from the combined dataset (genes, batches, labels).
vae = VAE(
    all_dataset.nb_genes,
    n_batch=all_dataset.n_batches,
    n_labels=all_dataset.n_labels,
    n_hidden=128,
    n_latent=30,
    n_layers=2,
    dispersion='gene',
)
# Read hyper-parameters from the snakemake params; only an AttributeError
# triggers the fallback default, any other exception propagates.
try:
    COMPONENTS = int(snakemake.params['components'])
except AttributeError:
    COMPONENTS = 10  # Latent component count

try:
    LAYERS = int(snakemake.params['layers'])
except AttributeError:
    LAYERS = 1  # number of hidden layers

RECONSTRUCTION_LOSS = "nb"

# `batch` and `counts` are defined outside this fragment.
# Batch ids become an int64 column vector; counts are cast to int64.
batch = batch.reshape((-1, 1)).astype('int64')
cvals = counts.values.astype('int64')

zz = GeneExpressionDataset.get_attributes_from_matrix(cvals, batch_indices=batch)

# NOTE(review): gene names are taken from counts.columns below, which suggests
# the matrix rows are cells - verify the "genes x cells" note for zz[0].
# zz[0]: int64, ndarray, genes x cells
# zz[1]: float32, ndarray, cells x 1
# zz[2]: float32, ndarray, cells x 1
# zz[3]: int64, ndarray, cells x 1

dataset = GeneExpressionDataset(*zz, gene_names=counts.columns)

# Training settings.
n_epochs = 400
lr = 1e-3
use_batches = True
use_cuda = torch.cuda.is_available()

torch.set_num_threads(20)  # However, need to set MKL_NUM_THREADS too
# Fragment: only the writes for batches 3 and 4 are visible here; each batch
# is a slice of 200 consecutive columns of `simulation_full`.
simulation_full.iloc[:, 600:800].to_csv(
    "./data/count_data_simulation_v1_batch3.csv", sep=",")
simulation_full.iloc[:, 800:1000].to_csv(
    "./data/count_data_simulation_v1_batch4.csv", sep=",")

# Load each batch file as an scVI CsvDataset limited to 3000 genes.
# NOTE(review): the files are written under ./data but read by bare file name;
# presumably CsvDataset's default save_path points at that directory - confirm.
simulation_batch_1 = CsvDataset("count_data_simulation_v1_batch1.csv",
                                new_n_genes=3000)
simulation_batch_2 = CsvDataset("count_data_simulation_v1_batch2.csv",
                                new_n_genes=3000)
simulation_batch_3 = CsvDataset("count_data_simulation_v1_batch3.csv",
                                new_n_genes=3000)
simulation_batch_4 = CsvDataset("count_data_simulation_v1_batch4.csv",
                                new_n_genes=3000)

# Concatenate the four batches into one dataset.
simulation_data = GeneExpressionDataset.concat_datasets(
    simulation_batch_1, simulation_batch_2, simulation_batch_3,
    simulation_batch_4)

# VAE sized from the concatenated dataset, trained for 100 epochs with
# train_size=0.9.
simulation_vae = VAE(simulation_data.nb_genes,
                     n_batch=simulation_data.n_batches,
                     n_labels=simulation_data.n_labels,
                     n_hidden=128, n_latent=30, n_layers=2,
                     dispersion='gene')
simulation_trainer = UnsupervisedTrainer(simulation_vae, simulation_data,
                                         train_size=0.9)
simulation_trainer.train(n_epochs=100)
# Fragment: `celltype1` is bound outside this view (presumably by an enclosing
# loop), and the final run_model call continues past the end of the fragment.
for celltype2 in dataset2.cell_types[:6]:
    if celltype1 != celltype2:
        print(celltype1 + ' ' + celltype2)

        # Copy of dataset1 filtered with every cell type except celltype1
        # and 'Other'.
        pbmc = deepcopy(dataset1)
        newCellType = [
            k for i, k in enumerate(dataset1.cell_types)
            if k not in [celltype1, 'Other']
        ]
        pbmc.filter_cell_types(newCellType)

        # Copy of dataset2 filtered with every cell type except celltype2
        # and 'Other'.
        pbmc2 = deepcopy(dataset2)
        newCellType = [
            k for i, k in enumerate(dataset2.cell_types)
            if k not in [celltype2, 'Other']
        ]
        pbmc2.filter_cell_types(newCellType)

        gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, pbmc2)
        # _,_,_,_,_ = run_model('writedata', gene_dataset, pbmc, pbmc2,filename=plotname+'.'
        # +celltype1.replace(' ','')+'.'
        # +celltype2.replace(' ',''))

        # File-name suffix ".<celltype1>.<celltype2>" with spaces removed.
        rmCellTypes = '.' + celltype1.replace(
            ' ', '') + '.' + celltype2.replace(' ', '')
        # Read the two '.CCA.txt' tables for this pair of removed cell types.
        latent1 = np.genfromtxt('../harmonization/Seurat_data/' + plotname +
                                rmCellTypes.replace(' ', '') + '.1.CCA.txt')
        latent2 = np.genfromtxt('../harmonization/Seurat_data/' + plotname +
                                rmCellTypes.replace(' ', '') + '.2.CCA.txt')
        # (call continues beyond this fragment)
        latent, batch_indices, labels, keys, stats = run_model(
            'readSeurat', gene_dataset, pbmc,
# The model to run is taken from the first command-line argument.
model_type = str(sys.argv[1])
plotname = 'simulation.EVF'

# Load the simulated counts (transposed after loading) and their metadata.
count = np.load('../sim_data/Sim_EVFbatch.UMI.npy')
count = count.T
meta = np.load('../sim_data/Sim_EVFbatch.meta.npy')

# Split rows by metadata column 2 (values 0 and 1); labels come from
# metadata column 1.
count_1 = count[meta[:, 2] == 0, :]
labels_1 = meta[meta[:, 2] == 0, 1]
count_2 = count[meta[:, 2] == 1, :]
labels_2 = meta[meta[:, 2] == 1, 1]

# Wrap each split as a dataset with 2000 generated gene names
# ('gene0'..'gene1999') and five cell types ('type1'..'type5').
dataset1 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(csr_matrix(count_1),
                                                      labels=labels_1),
    gene_names=['gene' + str(i) for i in range(2000)],
    cell_types=['type' + str(i + 1) for i in range(5)])
dataset2 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(csr_matrix(count_2),
                                                      labels=labels_2),
    gene_names=['gene' + str(i) for i in range(2000)],
    cell_types=['type' + str(i + 1) for i in range(5)])

gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)

if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    if model_type == 'vae':
        # (call continues beyond this fragment)
        vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches,
from scvi.dataset.dataset import GeneExpressionDataset
from scvi.harmonization.benchmark import knn_purity_avg
from scvi.inference import *
from scvi.models.scanvi import SCANVI
from scvi.models.vae import VAE

# The model type is hard-coded; the command-line variant is kept as a comment.
model_type = 'svaec'  # str(sys.argv[1])
plotname = 'EVFbatch_simulation'

# Load the UMI and non-UMI count matrices (transposed after loading) and
# their label vectors.
countUMI = np.load('../sim_data/count.UMI.npy').T
countnonUMI = np.load('../sim_data/count.nonUMI.npy').T
labelUMI = np.load('../sim_data/label.UMI.npy')
labelnonUMI = np.load('../sim_data/label.nonUMI.npy')

# Wrap each matrix as a dataset with 2000 generated gene names
# ('gene0'..'gene1999') and five cell types ('type1'..'type5').
UMI = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(countUMI), labels=labelUMI),
    gene_names=['gene'+str(i) for i in range(2000)],
    cell_types=['type'+str(i+1) for i in range(5)])
nonUMI = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(countnonUMI), labels=labelnonUMI),
    gene_names=['gene'+str(i) for i in range(2000)],
    cell_types=['type'+str(i+1) for i in range(5)])

if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    gene_dataset = GeneExpressionDataset.concat_datasets(UMI, nonUMI)
    if model_type == 'vae':
        vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches,
                  n_labels=gene_dataset.n_labels, n_hidden=128, n_latent=10,
                  n_layers=2, dispersion='gene')
        # `use_cuda` is defined outside this fragment.
        infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
        infer_vae.train(n_epochs=250)
def preprocess(self):
    """Return the merged Regev 10X nuclei data, rebuilding the cache if needed.

    If all four cache files written by a previous run exist under
    ``self.save_path`` they are loaded and returned. Otherwise the eight
    batches under ``10X_nuclei_Regev/`` are read, matched against the
    cluster membership table, concatenated, relabelled so that cell types
    are ordered by a hard-coded list of group prefixes, written to the
    cache files, and returned.

    Returns:
        tuple: ``(X, labels, cell_types, gene_names, labels_groups)``.
        On a rebuild, ``labels`` are indices into ``cell_types`` (the
        upper-cased cell-type names ordered by group) and
        ``labels_groups[i]`` is the index of the first group prefix that
        ``cell_types[i]`` starts with. On a cache hit the same five items
        are returned as loaded from disk.
    """
    svmlight_path = self.save_path + 'regev_data.svmlight'
    celltypes_path = self.save_path + 'regev_data.celltypes.npy'
    gene_names_path = self.save_path + 'regev_data.gene_names.npy'
    labels_groups_path = self.save_path + 'regev_data.labels_groups.npy'
    cache_files = (svmlight_path, celltypes_path, gene_names_path,
                   labels_groups_path)

    # Use the cache only when it is complete: the svmlight file is written
    # first, so an interrupted run can leave it behind without the .npy
    # companions, and loading those would then fail on every later call.
    if all(os.path.isfile(path) for path in cache_files):
        # NOTE(review): values loaded here may differ in dtype/container from
        # the freshly built ones returned below (e.g. labels as read back by
        # load_svmlight_file, labels_groups as an array) - confirm callers
        # accept both.
        count, labels = load_svmlight_file(svmlight_path)
        cell_type = np.load(celltypes_path)
        gene_names = np.load(gene_names_path)
        labels_groups = np.load(labels_groups_path)
        return (count, labels, cell_type, gene_names, labels_groups)

    data_dir = self.save_path + '10X_nuclei_Regev/'
    regev_batches = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']

    # Membership table, header row skipped. Column 0 is parsed as a
    # double-quoted "<barcode>-<batch number>", column 1 as a double-quoted
    # cluster id.
    label = np.genfromtxt(data_dir + 'cluster.membership.csv',
                          dtype='str', delimiter=',')
    label_batch = np.asarray(
        [str(int(x.split('-')[1].split('"')[0])) for x in label[1:, 0]])
    label_barcode = np.asarray(
        [x.split('-')[0].split('"')[1] for x in label[1:, 0]])
    label_cluster = np.asarray([x.split('"')[1] for x in label[1:, 1]])

    # Annotation table: quoted cluster id -> quoted cluster name.
    label_map = np.genfromtxt(data_dir + 'cluster.annotation.csv',
                              dtype='str', delimiter=',')
    label_map = dict(
        zip([x.split('"')[1] for x in label_map[:, 0]],
            [x.split('"')[1] for x in label_map[:, 1]]))

    regev_data = []
    for batch_i, batch in enumerate(regev_batches):
        geneid, cellid, count = get_matrix_from_h5(
            data_dir + batch + '1/filtered_gene_bc_matrices_h5.h5',
            'mm10-1.2.0_premrna')
        count = count.T.tocsr()
        # Drop the suffix after the first '-' so ids compare with the
        # barcodes parsed from the membership table.
        cellid = [barcode.split('-')[0] for barcode in cellid]

        # Batch numbers in the membership table are 1-based.
        in_batch = label_batch == str(batch_i + 1)
        label_dict = dict(
            zip(label_barcode[in_batch], label_cluster[in_batch]))
        new_count, matched_label = TryFindCells(label_dict, cellid, count)

        # Integer-encode the cluster ids by their sorted unique order.
        clusters, new_label = np.unique(matched_label, return_inverse=True)
        cell_type = [label_map[x] for x in clusters]

        dataset = GeneExpressionDataset(
            *GeneExpressionDataset.get_attributes_from_matrix(
                new_count, labels=new_label),
            gene_names=geneid,
            cell_types=cell_type)
        print(dataset.X.shape, len(dataset.labels))

        # regev_data starts as an empty list and becomes the running merge.
        if len(regev_data) > 0:
            regev_data = GeneExpressionDataset.concat_datasets(
                regev_data, dataset)
        else:
            regev_data = dataset

    dataset = regev_data
    groups = [
        'Pvalb', 'L2/3', 'Sst', 'L5 PT', 'L5 IT Tcap', 'L5 IT Aldh1a7',
        'L5 IT Foxp2', 'L5 NP', 'L6 IT', 'L6 CT', 'L6 NP', 'L6b', 'Lamp5',
        'Vip', 'Astro', 'OPC', 'VLMC', 'Oligo', 'Sncg', 'Endo', 'SMC',
        'MICRO'
    ]
    # Compare names case-insensitively by upper-casing both sides.
    cell_type = [x.upper() for x in dataset.cell_types]
    groups = [x.upper() for x in groups]

    # Per-cell cell-type name, then cell types reordered by group prefix.
    labels = np.asarray(
        [cell_type[x] for x in np.concatenate(dataset.labels)])
    cell_type_bygroup = np.concatenate(
        [[x for x in cell_type if x.startswith(y)] for y in groups])

    # Re-encode each cell's label as its position in the grouped ordering.
    new_labels_dict = dict(
        zip(cell_type_bygroup, np.arange(len(cell_type_bygroup))))
    new_labels = np.asarray([new_labels_dict[x] for x in labels])

    # For every grouped cell type, the index of the first matching group.
    labels_groups = [
        [i for i, x in enumerate(groups) if y.startswith(x)][0]
        for y in cell_type_bygroup
    ]

    dump_svmlight_file(dataset.X, new_labels, svmlight_path)
    np.save(celltypes_path, cell_type_bygroup)
    np.save(gene_names_path, dataset.gene_names)
    np.save(labels_groups_path, labels_groups)
    return (dataset.X, new_labels, cell_type_bygroup, dataset.gene_names,
            labels_groups)
# Fragment: the other per-sample datasets referenced below (Batch_*, LB6,
# RB1-3, CLL6_d0, CLL8_d0) are created outside this view, and the final VAE
# call continues past the end of the fragment.
CLL1_d0 = CsvDataset(filename='results/scvi/input_files/CLL1_d0.csv',
                     save_path='', sep=',', new_n_genes=False)
CLL5_d0 = CsvDataset(filename='results/scvi/input_files/CLL5_d0.csv',
                     save_path='', sep=',', new_n_genes=False)

## 10X healthy (5')
healthy_10x = CsvDataset(filename='results/scvi/input_files/healthy_10x.csv',
                         save_path='', sep=',', new_n_genes=False)

# Build one dataset from the per-sample count matrices, one list entry per
# sample.
all_dataset = GeneExpressionDataset()
all_dataset.populate_from_per_batch_list(Xs=[
    Batch_2_baseline.X, Batch_2_6m.X, Batch_2_relapse.X,
    Batch_3_baseline.X, Batch_3_6m.X, Batch_3_relapse.X,
    Batch_4_baseline.X, Batch_4_12m.X,
    Batch_5_baseline.X, Batch_5_6m.X, Batch_5_12m.X,
    Batch_6_baseline.X, Batch_6_relapse.X,
    Batch_7_baseline.X, Batch_7_relapse.X,
    LB6.X, RB1.X, RB2.X, RB3.X,
    CLL6_d0.X, CLL8_d0.X, CLL1_d0.X, CLL5_d0.X,
    healthy_10x.X
])

## Train, save and fin
# (call continues beyond this fragment)
vae = VAE(all_dataset.nb_genes, n_batch=all_dataset.n_batches,
          n_labels=all_dataset.n_labels, n_hidden=128, n_latent=30,
          n_layers=2,
LUAD_full.columns = ["sample_" + str(i) for i in range(1, 1402)] # write count data into desired format LUAD_full.iloc[:, 0:274].to_csv("./data/count_data_LUAD_v1_batch1.csv", sep=",") LUAD_full.iloc[:, 274:1176].to_csv("./data/count_data_LUAD_v1_batch2.csv", sep=",") LUAD_full.iloc[:, 1176:1401].to_csv("./data/count_data_LUAD_v1_batch3.csv", sep=",") LUAD_batch_1 = CsvDataset("count_data_LUAD_v1_batch1.csv", new_n_genes=2267) LUAD_batch_2 = CsvDataset("count_data_LUAD_v1_batch2.csv", new_n_genes=2267) LUAD_batch_3 = CsvDataset("count_data_LUAD_v1_batch3.csv", new_n_genes=2267) LUAD_data = GeneExpressionDataset.concat_datasets(LUAD_batch_1, LUAD_batch_2, LUAD_batch_3) LUAD_vae = VAE(LUAD_data.nb_genes, n_batch=LUAD_data.n_batches, n_labels=LUAD_data.n_labels, n_hidden=128, n_latent=30, n_layers=2, dispersion='gene') LUAD_trainer = UnsupervisedTrainer(LUAD_vae, LUAD_data, train_size=0.9) LUAD_trainer.train(n_epochs=100) LUAD_full = LUAD_trainer.create_posterior(LUAD_trainer.model, LUAD_data,
# Fragment: the list below is opened outside this view, and the final VAE call
# continues past the end of the fragment.
    "CD34+", "CD56+ NK", "CD4+/CD45RA+/CD25- Naive T", "CD4+/CD25 T Reg",
    "CD8+/CD45RA+ Naive Cytotoxic", "CD4+/CD45RO+ Memory", "CD8+ Cytotoxic T",
    "CD19+ B", "CD4+ T Helper2", "CD14+ Monocyte", "Dendritic"
]
# labels_map[i] is the new label (an index into `cell_type` below) assigned
# to cells whose current label is i.
labels_map = [6, 1, 0, 0, 4, 0, 4, 3, 0, 2, 5]
cell_type = [
    "CD4+ T Helper2", "CD56+ NK", "CD14+ Monocyte", "CD19+ B",
    "CD8+ Cytotoxic T", "Dendritic", "CD34+"
]
# Remap on a copy, masking on the untouched original labels so an already
# remapped value is never remapped a second time.
labels_new = deepcopy(pbmc_labels)
for i, j in enumerate(labels_map):
    labels_new[pbmc_labels == i] = j

dataset3 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(pbmc.tocsr(),
                                                      labels=labels_new),
    gene_names=genenames,
    cell_types=cell_type)

# `dataset1`, `dataset2`, `subpop` and `prop` are defined outside this
# fragment.
sub_dataset1 = sample_celltype(dataset1, subpop, prop)
# Report how many cells of `subpop` are in the sampled dataset.
print('total number of cells =' + str([
    np.sum(sub_dataset1.labels == i)
    for i, k in enumerate(sub_dataset1.cell_types) if k == subpop
][0]))

# Merge the three datasets and restrict the merged data to 5000 genes.
gene_dataset = GeneExpressionDataset.concat_datasets(sub_dataset1, dataset2,
                                                     dataset3)
gene_dataset.subsample_genes(5000)

# (call continues beyond this fragment)
vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches,
          n_labels=gene_dataset.n_labels,