Example #1
0
def test_synthetic_1():
    """Smoke-test SCANVI on the synthetic dataset.

    Trains a SCANVI model for one epoch with ``JointSemiSupervisedTrainer``,
    checks that a saved posterior can be reloaded with identical indices,
    expression matrix and labels, and then calls the posterior evaluation
    helpers to make sure they run.
    """
    dataset = SyntheticDataset()
    dataset.cell_types = np.array(["A", "B", "C"])

    def build_model():
        # Both the trained and the reloaded model share the same dimensions.
        return SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)

    trainer = JointSemiSupervisedTrainer(build_model(),
                                         dataset,
                                         use_cuda=use_cuda)
    trainer.train(n_epochs=1)
    labelled = trainer.labelled_set
    labelled.entropy_batch_mixing()

    # Save the labelled posterior to a throw-away directory and load it back
    # into a freshly constructed model.
    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        original_post = trainer.labelled_set.sequential()
        original_post.save_posterior(posterior_save_path)
        new_post = load_posterior(posterior_save_path,
                                  model=build_model(),
                                  use_cuda=False)

    assert np.array_equal(new_post.indices, original_post.indices)
    for attribute in ("X", "labels"):
        assert np.array_equal(getattr(new_post.gene_dataset, attribute),
                              getattr(original_post.gene_dataset, attribute))

    trainer.full_dataset.knn_purity()

    labelled = trainer.labelled_set
    unlabelled = trainer.unlabelled_set
    tsne_calls = (
        (labelled, {}),
        (unlabelled, {"color_by": "labels"}),
        (labelled, {"color_by": "batches and labels"}),
    )
    for posterior, extra_kwargs in tsne_calls:
        posterior.show_t_sne(n_samples=5, **extra_kwargs)

    labelled.clustering_scores()
    labelled.clustering_scores(prediction_algorithm="gmm")
    unlabelled.unsupervised_classification_accuracy()

    # Differential expression between the cells labelled 1 and those labelled 2.
    flat_labels = dataset.labels.ravel()
    unlabelled.differential_expression_score(
        flat_labels == 1,
        flat_labels == 2,
        n_samples=2,
        M_permutation=10,
    )
    unlabelled.one_vs_all_degenes(n_samples=2, M_permutation=10)
Example #2
0
def test_synthetic_1():
    """Smoke-test SCANVI joint training and posterior metrics on synthetic data.

    Nothing is asserted; the test passes when every call completes.
    """
    dataset = SyntheticDataset()
    dataset.cell_types = np.array(["A", "B", "C"])

    model = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)
    trainer = JointSemiSupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=1)

    labelled = trainer.labelled_set
    unlabelled = trainer.unlabelled_set

    labelled.entropy_batch_mixing()
    trainer.full_dataset.knn_purity()

    # t-SNE plots: default colouring, by label, and by batch plus label.
    tsne_calls = (
        (labelled, {}),
        (unlabelled, {"color_by": "labels"}),
        (labelled, {"color_by": "batches and labels"}),
    )
    for posterior, extra_kwargs in tsne_calls:
        posterior.show_t_sne(n_samples=5, **extra_kwargs)

    labelled.clustering_scores()
    labelled.clustering_scores(prediction_algorithm="gmm")
    unlabelled.unsupervised_classification_accuracy()

    # Differential expression between the cells labelled 1 and those labelled 2.
    flat_labels = dataset.labels.ravel()
    unlabelled.differential_expression_score(
        flat_labels == 1,
        flat_labels == 2,
        n_samples=2,
        M_permutation=10,
    )
    unlabelled.one_vs_all_degenes(n_samples=2, M_permutation=10)
Example #3
0
def test_synthetic_1():
    """Smoke-test SCANVI joint training and posterior metrics on synthetic data.

    This variant passes ``verbose=True`` to ``knn_purity`` and uses the
    cell-type-name based differential expression calls. Nothing is asserted;
    the test passes when every call completes.
    """
    dataset = SyntheticDataset()
    dataset.cell_types = np.array(['A', 'B', 'C'])

    model = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)
    trainer = JointSemiSupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=1)

    labelled = trainer.labelled_set
    unlabelled = trainer.unlabelled_set

    labelled.entropy_batch_mixing()
    trainer.full_dataset.knn_purity(verbose=True)

    # t-SNE plots: default colouring, by label, and by batch plus label.
    tsne_calls = (
        (labelled, {}),
        (unlabelled, {'color_by': 'labels'}),
        (labelled, {'color_by': 'batches and labels'}),
    )
    for posterior, extra_kwargs in tsne_calls:
        posterior.show_t_sne(n_samples=5, **extra_kwargs)

    labelled.clustering_scores()
    labelled.clustering_scores(prediction_algorithm='gmm')
    unlabelled.unsupervised_classification_accuracy()

    # Shared sampling settings for both differential expression calls.
    de_settings = {'M_sampling': 2, 'M_permutation': 10}
    unlabelled.differential_expression_score('B', 'C', genes=['2', '4'],
                                             **de_settings)
    unlabelled.differential_expression_table(**de_settings)
Example #4
0
def SCANVI_acc(gene_dataset: GeneExpressionDataset, plotname: str, pred1, pred2, coral1, coral2, rep='0'):
    """Write per-cell-type classification accuracies to ``../<plotname>/scanvi_acc.txt``.

    The output is a tab-separated table with a header row built from
    ``gene_dataset.cell_types`` and one row per method:

    * ``scanvi``  - ``SemiSupervisedTrainer`` with its default labelled split;
    * ``scanvi1`` - cells of batch 0 labelled, cells of batch 1 unlabelled;
    * ``scanvi2`` - cells of batch 1 labelled, cells of batch 0 unlabelled;
    * ``scmap1`` / ``coral1`` - ``pred1`` / ``coral1`` scored against the true
      labels of the batch-1 cells;
    * ``scmap2`` / ``coral2`` - ``pred2`` / ``coral2`` scored against the true
      labels of the batch-0 cells.

    For ``scanvi1`` and ``scanvi2`` a class that does not occur in both the
    labelled and the unlabelled set is reported as ``-1``.

    Parameters
    ----------
    gene_dataset : dataset providing ``labels``, ``batch_indices``,
        ``cell_types``, ``nb_genes``, ``n_batches`` and ``n_labels``.
    plotname : name of the output directory (relative to ``..``); also
        forwarded to ``trainVAE``.
    pred1, coral1 : predicted labels aligned with the cells of batch 1.
    pred2, coral2 : predicted labels aligned with the cells of batch 0.
    rep : forwarded unchanged to ``trainVAE``.
    """
    fname = '../%s/scanvi_acc.txt' % (plotname)
    methods = ['scanvi', 'scanvi1', 'scanvi2']

    def format_row(name, values):
        # One tab-separated line: the method name followed by the accuracies.
        return name + "\t" + "%.4f\t" * len(values) % tuple(values) + "\n"

    # The context manager closes the file even when training raises.
    with open(fname, "w+") as f:
        f.write('method\t' + "%s\t" * len(gene_dataset.cell_types) % tuple(gene_dataset.cell_types) + "\n")

        for method in methods:
            # Every method starts from a SCANVI model initialised with VAE weights.
            vae_posterior = trainVAE(gene_dataset, plotname, rep)
            scanvi = SCANVI(gene_dataset.nb_genes, gene_dataset.n_batches, gene_dataset.n_labels, n_layers=2)
            scanvi.load_state_dict(vae_posterior.model.state_dict(), strict=False)

            if method == 'scanvi':
                trainer_scanvi = SemiSupervisedTrainer(scanvi, gene_dataset, classification_ratio=50,
                                                       n_epochs_classifier=1, lr_classification=5 * 1e-3)
            else:
                trainer_scanvi = AlternateSemiSupervisedTrainer(scanvi, gene_dataset, classification_ratio=10,
                                                                n_epochs_classifier=50, lr_classification=5 * 1e-3)
                # 'scanvi1' labels batch 0, 'scanvi2' labels batch 1; the other
                # batch becomes the unlabelled set.
                labelled_batch = 0 if method == 'scanvi1' else 1
                unlabelled_batch = 1 - labelled_batch
                trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == labelled_batch))
                trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == unlabelled_batch))

            trainer_scanvi.train(n_epochs=5)
            labelled_idx = trainer_scanvi.labelled_set.indices
            unlabelled_idx = trainer_scanvi.unlabelled_set.indices
            full = trainer_scanvi.create_posterior(trainer_scanvi.model, gene_dataset,
                                                   indices=np.arange(len(gene_dataset)))
            labels, labels_pred = full.sequential().compute_predictions()

            shared = set(labels[labelled_idx]).intersection(set(labels[unlabelled_idx]))
            classes = np.unique(labels)
            acc = [np.mean(labels_pred[unlabelled_idx][labels[unlabelled_idx] == c] == c) for c in classes]
            if method != 'scanvi':
                # Index by position in `classes`, not by label value, so that
                # non-contiguous label sets mark the correct column.
                for position, c in enumerate(classes):
                    if c not in shared:
                        acc[position] = -1
            f.write(format_row(method, acc))

        labels = gene_dataset.labels.ravel()
        batch = gene_dataset.batch_indices.ravel()
        classes = np.unique(labels)
        # Each external prediction vector is scored against the batch it targets.
        baselines = (
            ('scmap1', pred1, 1),
            ('scmap2', pred2, 0),
            ('coral1', coral1, 1),
            ('coral2', coral2, 0),
        )
        for name, predictions, target_batch in baselines:
            target_labels = labels[batch == target_batch]
            acc = [np.mean(predictions[target_labels == c] == c) for c in classes]
            f.write(format_row(name, acc))
Example #5
0
def test_nb_not_zinb():
    """Smoke-test SCANVI with ``reconstruction_loss="nb"`` and label groups.

    Trains for a single epoch on the synthetic dataset; nothing is asserted.
    """
    dataset = SyntheticDataset()
    model_options = {
        "labels_groups": [0, 0, 1],
        "reconstruction_loss": "nb",
    }
    model = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels,
                   **model_options)
    trainer = JointSemiSupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=1)
Example #6
0
def test_cortex(save_path):
    """Smoke-test the VAE, SCANVI and classifier pipelines on the Cortex dataset.

    Parameters
    ----------
    save_path : directory handed to ``CortexDataset`` and to
        ``imputation_benchmark``.

    Nothing is asserted; the test passes when every call completes.
    """
    dataset = CortexDataset(save_path=save_path)

    # --- unsupervised VAE ---------------------------------------------------
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    vae_trainer.train(n_epochs=1)
    vae_trainer.train_set.ll()
    vae_trainer.train_set.differential_expression_stats()

    # Corrupt the data (explicit 'binomial', then the default), retrain, restore.
    vae_trainer.corrupt_posteriors(corruption='binomial')
    vae_trainer.corrupt_posteriors()
    vae_trainer.train(n_epochs=1)
    vae_trainer.uncorrupt_posteriors()

    vae_trainer.train_set.imputation_benchmark(
        n_samples=1,
        show_plot=False,
        title_plot='imputation',
        save_path=save_path,
    )

    def new_scanvi():
        return SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)

    # --- SCANVI, joint training ---------------------------------------------
    joint_trainer = JointSemiSupervisedTrainer(
        new_scanvi(), dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda)
    joint_trainer.train(n_epochs=1)
    joint_trainer.labelled_set.accuracy()
    joint_trainer.full_dataset.ll()

    # --- SCANVI, alternating training ---------------------------------------
    alternate_trainer = AlternateSemiSupervisedTrainer(
        new_scanvi(), dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda)
    alternate_trainer.train(n_epochs=1, lr=1e-2)
    alternate_trainer.unlabelled_set.accuracy()

    # Baseline classifiers on the raw labelled / unlabelled split.
    data_train, labels_train = alternate_trainer.labelled_set.raw_data()
    data_test, labels_test = alternate_trainer.unlabelled_set.raw_data()
    split = (data_train, labels_train, data_test, labels_test)
    compute_accuracy_svc(*split, param_grid=[{'C': [1], 'kernel': ['linear']}])
    compute_accuracy_rf(*split, param_grid=[{'max_depth': [3], 'n_estimators': [10]}])

    # --- stand-alone classifier ---------------------------------------------
    classifier = Classifier(dataset.nb_genes, n_labels=dataset.n_labels)
    classifier_trainer = ClassifierTrainer(classifier, dataset)
    classifier_trainer.train(n_epochs=1)
    classifier_trainer.train_set.accuracy()
Example #7
0
def test_hierarchy():
    """Smoke-test SCANVI with an ontology (``use_ontology=True``).

    Trains a three-layer model with the "zinb" reconstruction loss for one
    epoch on the synthetic dataset; nothing is asserted.
    """
    dataset = SyntheticDataset()

    # Two ontology matrices, passed to SCANVI as a list in this order.
    first_matrix = np.array([[1, 1, 0], [0, 0, 1]])
    second_matrix = np.array([[1, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 1]])

    model = SCANVI(
        dataset.nb_genes,
        dataset.n_batches,
        dataset.n_labels,
        ontology=[first_matrix, second_matrix],
        use_ontology=True,
        reconstruction_loss="zinb",
        n_layers=3,
    )
    trainer = JointSemiSupervisedTrainer(model, dataset, use_cuda=use_cuda)
    trainer.train(n_epochs=1)
Example #8
0
def test_cortex(save_path):
    """Exercise the VAE, SCANVI and classifier pipelines on the Cortex dataset.

    Besides running the calls, the test checks the shapes returned by
    ``generate_parameters`` (with and without ``n_samples`` / ``give_mean``)
    and by ``generate``.

    Parameters
    ----------
    save_path : directory handed to ``CortexDataset`` and to
        ``imputation_benchmark``.
    """
    dataset = CortexDataset(save_path=save_path)

    # --- unsupervised VAE ---------------------------------------------------
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda
    )
    vae_trainer.train(n_epochs=1)
    vae_trainer.train_set.reconstruction_error()
    vae_trainer.train_set.differential_expression_stats()
    for correlation_type in ("pearson", "spearman"):
        vae_trainer.train_set.generate_feature_correlation_matrix(
            n_samples=2, correlation_type=correlation_type
        )
    vae_trainer.train_set.imputation(n_samples=1)
    vae_trainer.test_set.imputation(n_samples=5)

    # Corrupt the data (explicit "binomial", then the default), retrain, restore.
    vae_trainer.corrupt_posteriors(corruption="binomial")
    vae_trainer.corrupt_posteriors()
    vae_trainer.train(n_epochs=1)
    vae_trainer.uncorrupt_posteriors()

    benchmark_kwargs = dict(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )
    vae_trainer.train_set.imputation_benchmark(**benchmark_kwargs)
    vae_trainer.train_set.generate_parameters()

    # --- shape checks for generate_parameters -------------------------------
    n_cells = len(vae_trainer.train_set.indices)
    n_genes = dataset.nb_genes
    n_samples = 3
    per_cell_shape = (n_cells, n_genes)
    per_sample_shape = (n_samples, n_cells, n_genes)

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters()
    assert dropout.shape == per_cell_shape
    assert means.shape == per_cell_shape
    assert dispersions.shape == per_cell_shape

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters(
        n_samples=n_samples
    )
    assert dropout.shape == per_sample_shape
    assert means.shape == per_sample_shape

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters(
        n_samples=n_samples, give_mean=True
    )
    assert dropout.shape == per_cell_shape
    assert means.shape == per_cell_shape

    # --- shape checks for generate ------------------------------------------
    every_cell = np.arange(len(dataset))
    full = vae_trainer.create_posterior(vae, dataset, indices=every_cell)
    x_new, x_old = full.generate(n_samples=10)
    assert x_new.shape == (dataset.nb_cells, dataset.nb_genes, 10)
    assert x_old.shape == (dataset.nb_cells, dataset.nb_genes)

    vae_trainer.train_set.imputation_benchmark(**benchmark_kwargs)

    def new_scanvi():
        return SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)

    # --- SCANVI, joint training ---------------------------------------------
    joint_trainer = JointSemiSupervisedTrainer(
        new_scanvi(), dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    joint_trainer.train(n_epochs=1)
    joint_trainer.labelled_set.accuracy()
    joint_trainer.full_dataset.reconstruction_error()

    # --- SCANVI, alternating training ---------------------------------------
    alternate_trainer = AlternateSemiSupervisedTrainer(
        new_scanvi(), dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    alternate_trainer.train(n_epochs=1, lr=1e-2)
    alternate_trainer.unlabelled_set.accuracy()

    # Baseline classifiers on the raw labelled / unlabelled split.
    data_train, labels_train = alternate_trainer.labelled_set.raw_data()
    data_test, labels_test = alternate_trainer.unlabelled_set.raw_data()
    split = (data_train, labels_train, data_test, labels_test)
    compute_accuracy_svc(*split, param_grid=[{"C": [1], "kernel": ["linear"]}])
    compute_accuracy_rf(
        *split, param_grid=[{"max_depth": [3], "n_estimators": [10]}]
    )

    # --- stand-alone classifier ---------------------------------------------
    classifier = Classifier(dataset.nb_genes, n_labels=dataset.n_labels)
    classifier_trainer = ClassifierTrainer(classifier, dataset)
    classifier_trainer.train(n_epochs=1)
    classifier_trainer.train_set.accuracy()
Example #9
0
def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI on a benchmark dataset with cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    It must define n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "" (only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection).

    Raises
    ------
    ValueError : if NumGenes is negative.

    Side effects
    ------------
    Writes the intermediate files 'Labels_scvi.csv' and 'Data_scvi.csv' to the current
    working directory, then changes the working directory to OutputDir and writes the
    four result files there.
    '''
    if NumGenes < 0:
        raise ValueError("NumGenes must be a non-negative integer, got %r" % (NumGenes,))

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    use_feature_selection = NumGenes > 0
    if use_feature_selection:
        # read the feature file
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    else:
        # Without feature selection the dataset is the same for every fold,
        # so it is exported and loaded once.
        # save labels as csv file with header and index column
        labels.to_csv('Labels_scvi.csv')
        data.to_csv('Data_scvi.csv')

        train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        if use_feature_selection:
            # Each fold has its own gene ranking (column i of the feature file).
            feat_to_use = features.iloc[0:NumGenes,i]
            data2 = data.iloc[:,feat_to_use]

            labels.to_csv('Labels_scvi.csv')
            data2.to_csv('Data_scvi.csv')

            train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)

        # A fresh model and trainer per fold: reusing one model across folds would
        # start each fold from weights already trained on that fold's test cells.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # Replace the trainer's own labelled/unlabelled split with the fold's indices.
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time()-start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time()-start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    # write results
    os.chdir(OutputDir)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # File names carry the gene count only when feature selection was used.
    if NumGenes == 0:
        prefix = "scVI_"
    else:
        prefix = "scVI_" + str(NumGenes) + "_"

    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)
def custom_objective_hyperopt(space,
                              is_best_training=False,
                              dataset=None,
                              n_epochs=None):
    """Custom objective function for advanced autotune tutorial.

    Builds a SCANVI model and a ``SemiSupervisedTrainer`` from the sampled
    hyperparameters. Cells with ``batch_indices == 1`` form the unlabelled
    set and cells with ``batch_indices == 0`` are the labelled pool.

    :param space: mapping that may contain ``"model_tunable_kwargs"``,
        ``"trainer_tunable_kwargs"`` and ``"train_func_tunable_kwargs"``
        dictionaries; missing entries default to empty dictionaries.
    :param is_best_training: when False, run five train/validation splits of
        the labelled pool and return a hyperopt result dictionary; when True,
        train once on the whole labelled pool and return the trainer.
    :param dataset: dataset the model is trained on.
    :param n_epochs: number of epochs passed to ``train``.
    :return: ``{"loss", "space", "status"}`` dictionary, or the trained
        trainer when ``is_best_training`` is True.
    """
    space = defaultdict(dict, space)
    model_kwargs = space["model_tunable_kwargs"]
    trainer_kwargs = space["trainer_tunable_kwargs"]
    train_kwargs = space["train_func_tunable_kwargs"]

    # Hard-coded parameters; these take precedence over the tunable ones.
    fixed_model_kwargs = {}
    fixed_trainer_kwargs = {
        "use_cuda": bool(torch.cuda.device_count()),
        # disable scVI progbar
        "show_progbar": False,
        "frequency": 1,
    }
    fixed_train_kwargs = {"n_epochs": n_epochs}

    model_kwargs.update(fixed_model_kwargs)
    trainer_kwargs.update(fixed_trainer_kwargs)
    train_kwargs.update(fixed_train_kwargs)

    model = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels,
                   **model_kwargs)
    trainer_scanvi = SemiSupervisedTrainer(model, dataset, **trainer_kwargs)

    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        indices=np.squeeze(dataset.batch_indices == 1))
    trainer_scanvi.unlabelled_set.to_monitor = [
        "reconstruction_error", "accuracy"
    ]
    labelled_mask = np.squeeze(dataset.batch_indices == 0)

    if is_best_training:
        # Final run: use every labelled cell and hand back the trainer.
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=labelled_mask)
        trainer_scanvi.labelled_set.to_monitor = [
            "reconstruction_error", "accuracy"
        ]
        trainer_scanvi.train(**train_kwargs)
        return trainer_scanvi

    # Search run: five random 80/20 splits of the labelled pool; the score of
    # each split is the last recorded accuracy on the unlabelled set.
    n_splits = 5
    accuracies = np.zeros(n_splits)
    for split in range(n_splits):
        train_indices, val_indices = train_test_split(
            labelled_mask.nonzero()[0], test_size=0.2)
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=train_indices)
        trainer_scanvi.labelled_set.to_monitor = [
            "reconstruction_error",
            "accuracy",
        ]
        trainer_scanvi.validation_set = trainer_scanvi.create_posterior(
            indices=val_indices)
        trainer_scanvi.validation_set.to_monitor = ["accuracy"]
        trainer_scanvi.train(**train_kwargs)
        accuracies[split] = trainer_scanvi.history["accuracy_unlabelled_set"][-1]

    result = {
        "loss": -accuracies.mean(),
        "space": space,
        "status": STATUS_OK,
    }
    return result
Example #11
0
def run_scVI(input_dir, output_dir, datafile, labfile, Rfile):
    '''
    Run scVI with the cross validation folds stored in an RData file.

    Parameters
    ----------
    input_dir : directory of the input files
    output_dir : directory of the output files
    datafile : name of the data file
    labfile : name of the label file
    Rfile : file to read the cross validation indices from; it must define
        n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx

    Side effects
    ------------
    Changes the working directory to input_dir, writes the intermediate files
    'Labels_scvi.csv' and 'Data_scvi.csv' there, then changes the working
    directory to output_dir and writes the four result files.
    '''
    # All input file names are resolved relative to input_dir.
    os.chdir(input_dir)

    # read the Rdata file
    robjects.r['load'](Rfile)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(datafile, index_col=0, sep=',')
    labels = pd.read_csv(labfile,
                         header=0,
                         index_col=None,
                         sep=',',
                         usecols=col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # save labels as csv file with header and index column
    labels.to_csv('Labels_scvi.csv')
    data.to_csv('Data_scvi.csv')

    train = CsvDataset('Data_scvi.csv',
                       save_path=input_dir,
                       sep=",",
                       labels_file="Labels_scvi.csv",
                       gene_by_cell=False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        # A fresh model and trainer per fold: reusing one model across folds
        # would start each fold from weights already trained on that fold's
        # test cells.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # Replace the trainer's own labelled/unlabelled split with the fold's
        # indices.
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=(train_ind_i).ravel(), shuffle=False)
        trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
            indices=(test_ind_i).ravel(), shuffle=False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time() - start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time() - start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    # write results
    os.chdir(output_dir)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # The (0-based) label column index is part of every output file name.
    prefix = "scVI_" + str(col)
    truelab.to_csv(prefix + "_true.csv", index=False)
    pred.to_csv(prefix + "_pred.csv", index=False)

    tr_time.to_csv(prefix + "_training_time.csv", index=False)
    ts_time.to_csv(prefix + "_test_time.csv", index=False)
Example #12
0
def run_scVI(trainname, testname, n):
    """Train SCANVI on one dataset, predict a second one, and record time and memory.

    The train and test matrices are concatenated column-wise and their label
    tables row-wise; the first ``trainlabel.shape[0]`` cells form the labelled
    set and the following ``testlabel.shape[0]`` cells the unlabelled set.

    Parameters
    ----------
    trainname : base name of the training files (``<name>.csv`` and
        ``<name>_label.csv``) inside the hard-coded benchmark directory.
    testname : base name of the test files, same layout.
    n : string prefix of the two output csv files.

    Returns
    -------
    (mem_train, time_train, mem_test, time_test) : the ``display_top`` result
    and the elapsed whole seconds for the training and the prediction phase.

    Side effects
    ------------
    Changes the working directory to the hard-coded output directory and
    writes ``data_scvi.csv``, ``labels_scvi.csv`` and the two result files
    there.
    """
    data_dir = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'
    work_dir = "/dora/nobackup/yuec/scclassify/benchmark/scVI/vary_test"

    train = pd.read_csv(data_dir + trainname + '.csv', index_col=0, sep=',')
    test = pd.read_csv(data_dir + testname + '.csv', index_col=0, sep=',')
    trainlabel = pd.read_csv(data_dir + trainname + '_label.csv',
                             header=0,
                             index_col=0,
                             sep=',')
    testlabel = pd.read_csv(data_dir + testname + '_label.csv',
                            header=0,
                            index_col=0,
                            sep=',')

    newdata = pd.concat([train, test], axis=1)
    newlabel = pd.concat([trainlabel, testlabel], axis=0)

    # Export the merged data so that CsvDataset can load it.
    os.chdir(work_dir)
    newdata.to_csv('data_scvi.csv')
    newlabel.to_csv('labels_scvi.csv')
    data = CsvDataset('data_scvi.csv',
                      save_path="",
                      sep=",",
                      labels_file="labels_scvi.csv",
                      gene_by_cell=True)

    n_epochs = 100
    n_train_cells = trainlabel.shape[0]
    n_test_cells = testlabel.shape[0]

    # ---- training phase (timed, memory traced) -----------------------------
    train_started = time.time()
    tracemalloc.start()

    scanvi = SCANVI(data.nb_genes, data.n_batches, data.n_labels)
    trainer_scanvi = SemiSupervisedTrainer(scanvi, data, frequency=5)

    # Replace the trainer's own split: train cells labelled, test cells unlabelled.
    labelled_indices = list(range(0, n_train_cells))
    unlabelled_indices = list(range(n_train_cells, n_train_cells + n_test_cells))
    trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
        indices=labelled_indices, shuffle=False)
    trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        indices=unlabelled_indices, shuffle=False)
    trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

    trainer_scanvi.train(n_epochs)

    mem_train = display_top(tracemalloc.take_snapshot())
    time_train = int(time.time() - train_started)

    # ---- prediction phase (timed, memory traced) ---------------------------
    ## labels are returned in numbers, should be mapped back to the real labels
    test_started = time.time()
    tracemalloc.start()

    y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()

    mem_test = display_top(tracemalloc.take_snapshot())
    time_test = int(time.time() - test_started)

    truelab = pd.DataFrame(list(y_true))
    pred = pd.DataFrame(list(y_pred))

    os.chdir(work_dir)

    truelab.to_csv(n + "_scVI_True.csv", index=False)
    pred.to_csv(n + "_scVI_Pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test
Example #13
0
adata_test.var_names_make_unique()


# PRE-PROCESS
# Filter genes on the training data only (min_cells is 5% of adata's rows),
# then log1p-transform both the training and the test data.
# NOTE(review): scVI-family models are usually fit on raw counts - confirm that
# log1p-transformed input is intended here.
sc.pp.filter_genes(adata, min_cells=int(0.05 * adata.shape[0]))
sc.pp.log1p(adata)
sc.pp.log1p(adata_test)
# Then find the 2000 most variable genes of the training data
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat")
# Flag which cells come from the test data so they can be told apart after merging
adata.obs["scanvi_test"] = False
adata_test.obs["scanvi_test"] = True

# SELECT SAME GENES
# Keep the highly variable training genes that are also present in the test data
genes = adata[:,adata.var.highly_variable.tolist()].var_names
genes_shared = [i in adata_test.var_names.to_list() for i in genes]
genes = genes[genes_shared]

adata = adata[:, genes.to_list()]
adata_test = adata_test[:, genes.to_list()]

adata_merged = adata.concatenate(adata_test)

# SCANVI
adata_scanvi = AnnDatasetFromAnnData(adata_merged)
scanvi = SCANVI(adata_scanvi.nb_genes, adata_scanvi.n_batches, adata_scanvi.n_labels)
trainer_scanvi = SemiSupervisedTrainer(scanvi, adata_scanvi, frequency=5)

n_epochs = 200
# NOTE(review): the next statement is truncated (attribute name missing after
# `adata_scanvi.`) and does not parse; the intended labelled-set indices must
# be filled in.
trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=adata_scanvi.)
Example #14
0
def runScanvi(adata, batch, labels):
    """Embed ``adata`` with scANVI, initialised from a scVI model.

    A VAE (scVI) is trained first on the counts stored in
    ``adata.layers['counts']``; its weights initialise a SCANVI model, which is
    then trained with every cell in the labelled set. The latent
    representation of all cells is stored in ``adata.obsm['X_emb']``.

    Parameters
    ----------
    adata : AnnData object with a ``counts`` layer (non-normalized data).
    batch : key of ``adata.obs`` holding the batch annotation.
    labels : key of ``adata.obs`` holding the cell type annotation.

    Returns
    -------
    The input ``adata`` with ``obsm['X_emb']`` set.

    Raises
    ------
    TypeError : if ``adata.layers`` has no ``counts`` entry.
    """
    # Use non-normalized (count) data for scanvi!
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers[`counts`]`'
        )

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # Hyperparameters.
    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_latent = 30
    n_hidden = 128
    n_layers = 2
    n_epochs_scVI = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_epochs_scANVI = int(np.min([10, np.max([2, round(n_epochs_scVI / 3.)])]))

    # STEP 1: prepare the data - work on a copy whose X holds the counts.
    counts_adata = adata.copy()
    counts_adata.X = adata.layers['counts']
    del counts_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del counts_adata.raw  # Note that this only works from anndata 0.7

    # Integer-encode the batch and the cell type annotations.
    le = LabelEncoder()
    batch_values = counts_adata.obs[batch].values
    counts_adata.obs['batch_indices'] = le.fit_transform(batch_values)
    label_values = counts_adata.obs[labels].values
    counts_adata.obs['labels'] = le.fit_transform(label_values)

    dataset = AnnDatasetFromAnnData(counts_adata)

    print("scANVI dataset object with {} batches and {} cell types".format(
        dataset.n_batches, dataset.n_labels))

    all_cells = np.arange(len(dataset))

    # STEP 2: RUN scVI to initialize scANVI
    vae = VAE(
        dataset.nb_genes,
        reconstruction_loss='nb',
        n_batch=dataset.n_batches,
        n_latent=n_latent,
        n_hidden=n_hidden,
        n_layers=n_layers,
    )
    trainer = UnsupervisedTrainer(vae, dataset, train_size=1.0, use_cuda=False)
    trainer.train(n_epochs=n_epochs_scVI, lr=1e-3)

    # STEP 3: RUN scANVI, starting from the scVI weights
    scanvi = SCANVI(
        dataset.nb_genes,
        dataset.n_batches,
        dataset.n_labels,
        n_hidden=n_hidden,
        n_latent=n_latent,
        n_layers=n_layers,
        dispersion='gene',
        reconstruction_loss='nb',
    )
    scanvi.load_state_dict(trainer.model.state_dict(), strict=False)

    # use default parameter from semi-supervised trainer class
    trainer_scanvi = SemiSupervisedTrainer(scanvi, dataset)
    # use all cells as labelled set
    trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, dataset, indices=all_cells)
    # put one cell in the unlabelled set
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        indices=[0])
    trainer_scanvi.train(n_epochs=n_epochs_scANVI)

    # extract the latent representation of every cell from the posterior
    scanvi_full = trainer_scanvi.create_posterior(
        trainer_scanvi.model, dataset, indices=np.arange(len(dataset)))
    latent, _, _ = scanvi_full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata