def run_moana(trainname, testname, n):
    """Benchmark the moana ``CellTypeClassifier`` on one train/test pair.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, fits the classifier on the training matrix,
    predicts the test matrix, and writes the test label table and the
    predictions to ``<n>_moana_True.csv`` and ``<n>_moana_Pred.csv``.

    Side effect: the process working directory is changed to the moana
    output directory before the result files are written.

    Args:
        trainname: File stem of the training data set.
        testname: File stem of the test data set.
        n: String prefix used for the two output file names.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``. The memory
        values are whatever ``display_top`` returns for the phase's
        tracemalloc snapshot; the times are whole seconds (truncated).
    """
    data_dir = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    def _load_matrix(name):
        """Read ``<name>.csv`` and rebuild it as a named ExpMatrix."""
        raw = ExpMatrix.read_tsv(data_dir + name + '.csv', sep=',')
        expr = ExpMatrix(X=raw.X, genes=raw.genes, cells=raw.cells)
        expr.genes.name = 'Genes'
        expr.cells.name = 'Cells'
        expr.index.name = 'Genes'
        expr.columns.name = 'Cells'
        return expr

    def _measure(step):
        """Run ``step()`` in its own tracemalloc session.

        Returns ``(result, memory_report, elapsed_seconds)``.
        """
        # start() is a no-op while tracing is already active, so drop any
        # session left over from an earlier phase or benchmark first;
        # otherwise this phase's snapshot would include foreign allocations.
        tracemalloc.stop()
        tracemalloc.start()
        began = time.time()
        try:
            result = step()
            # Stop the clock before the snapshot is taken and summarised so
            # the reported time covers only the measured step.
            elapsed = int(time.time() - began)
            memory_report = display_top(tracemalloc.take_snapshot())
        finally:
            # Never leave tracing enabled: it slows down and pollutes
            # whatever runs next.
            tracemalloc.stop()
        return result, memory_report, elapsed

    # ---- training ----
    train_matrix = _load_matrix(trainname)
    train_labels = pd.read_csv(data_dir + trainname + '_label.csv',
                               header=0, index_col=0, sep=',')
    cell_labels = CellAnnVector(cells=train_matrix.cells,
                                data=train_labels['x'].values)

    def _fit():
        model = CellTypeClassifier()
        model.fit(matrix=train_matrix, cell_labels=cell_labels)
        return model

    clf, mem_train, time_train = _measure(_fit)

    # ---- prediction ----
    test_matrix = _load_matrix(testname)
    predictions, mem_test, time_test = _measure(
        lambda: clf.predict(test_matrix))

    pred = pd.DataFrame(np.asarray(predictions))
    # NOTE(review): the test labels are read with index_col=None (the
    # training labels use index_col=0), so the label file's first column is
    # written out as data. Kept as-is because downstream readers may rely on
    # this layout -- confirm.
    truelab = pd.read_csv(data_dir + testname + '_label.csv',
                          header=0, index_col=None, sep=',')

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/moanna/vary_train")
    truelab.to_csv(n + "_moana_True.csv", index=False)
    pred.to_csv(n + "_moana_Pred.csv", index=False)
    return mem_train, time_train, mem_test, time_test
def run_SVMreject(trainname, testname, n, reject_threshold=0.7):
    """Benchmark a calibrated linear SVM with a rejection option.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, fits ``CalibratedClassifierCV(LinearSVC())`` on the
    transposed training table, predicts the transposed test table, relabels
    low-confidence predictions as ``'Unassigned'`` and writes the true and
    predicted labels to ``<n>_SVMreject_true.csv`` and
    ``<n>_SVMreject_pred.csv``.

    Side effect: the process working directory is changed to the SVMreject
    output directory before the result files are written.

    Args:
        trainname: File stem of the training data set.
        testname: File stem of the test data set.
        n: String prefix used for the two output file names.
        reject_threshold: Predictions whose highest class probability is
            below this value are replaced by ``'Unassigned'``.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``. The memory
        values are whatever ``display_top`` returns for the phase's
        tracemalloc snapshot; the times are whole seconds (truncated).
    """
    data_dir = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    def _measure(step):
        """Run ``step()`` in its own tracemalloc session.

        Returns ``(result, memory_report, elapsed_seconds)``.
        """
        # start() is a no-op while tracing is already active, so drop any
        # session left over from an earlier phase or benchmark first;
        # otherwise this phase's snapshot would include foreign allocations.
        tracemalloc.stop()
        tracemalloc.start()
        began = time.time()
        try:
            result = step()
            # Stop the clock before the snapshot is taken and summarised so
            # the reported time covers only the measured step.
            elapsed = int(time.time() - began)
            memory_report = display_top(tracemalloc.take_snapshot())
        finally:
            # Never leave tracing enabled: it slows down and pollutes
            # whatever runs next.
            tracemalloc.stop()
        return result, memory_report, elapsed

    # read the data
    train = pd.read_csv(data_dir + trainname + '.csv', index_col=0, sep=',')
    test = pd.read_csv(data_dir + testname + '.csv', index_col=0, sep=',')
    y_train = pd.read_csv(data_dir + trainname + '_label.csv',
                          header=0, index_col=0, sep=',')['x'].to_numpy()
    y_test = pd.read_csv(data_dir + testname + '_label.csv',
                         header=0, index_col=0, sep=',')['x'].to_numpy()

    # The classifier is fitted on the transposed tables.
    train = train.transpose()
    test = test.transpose()

    # ---- training ----
    def _fit():
        model = CalibratedClassifierCV(LinearSVC())
        model.fit(train, y_train)
        return model

    clf, mem_train, time_train = _measure(_fit)

    # ---- prediction ----
    predicted, mem_test, time_test = _measure(lambda: clf.predict(test))

    # Rejection step (not part of the timed prediction, as before).
    prob = np.max(clf.predict_proba(test), axis=1)
    # Cast to object dtype first: assigning 'Unassigned' into a fixed-width
    # string array would silently truncate it, and into a numeric array it
    # would raise.
    predicted = np.asarray(predicted, dtype=object)
    predicted[prob < reject_threshold] = 'Unassigned'

    truelab = pd.DataFrame(y_test)
    pred = pd.DataFrame(predicted)

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/SVMreject/vary_test")
    truelab.to_csv(n + "_SVMreject_true.csv", index=False)
    pred.to_csv(n + "_SVMreject_pred.csv", index=False)
    return mem_train, time_train, mem_test, time_test
def run_scVI(trainname, testname, n):
    """Benchmark scVI's SCANVI model on one train/test pair.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, concatenates train and test into one data table and
    one label table, writes them to ``data_scvi.csv`` / ``labels_scvi.csv``
    in the scVI output directory and loads them back as a ``CsvDataset``.
    A ``SemiSupervisedTrainer`` is then trained with the training cells as
    the labelled set and the test cells as the unlabelled set, and the
    unlabelled-set predictions are written to ``<n>_scVI_True.csv`` and
    ``<n>_scVI_Pred.csv``.

    Side effect: the process working directory is changed to the scVI
    output directory (the intermediate CSV files are addressed relative
    to it).

    Args:
        trainname: File stem of the training data set.
        testname: File stem of the test data set.
        n: String prefix used for the two output file names.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``. The memory
        values are whatever ``display_top`` returns for the phase's
        tracemalloc snapshot; the times are whole seconds (truncated).
    """
    data_dir = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'
    out_dir = "/dora/nobackup/yuec/scclassify/benchmark/scVI/vary_test"

    def _measure(step):
        """Run ``step()`` in its own tracemalloc session.

        Returns ``(result, memory_report, elapsed_seconds)``.
        """
        # start() is a no-op while tracing is already active, so drop any
        # session left over from an earlier phase or benchmark first;
        # otherwise this phase's snapshot would include foreign allocations.
        tracemalloc.stop()
        tracemalloc.start()
        began = time.time()
        try:
            result = step()
            # Stop the clock before the snapshot is taken and summarised so
            # the reported time covers only the measured step.
            elapsed = int(time.time() - began)
            memory_report = display_top(tracemalloc.take_snapshot())
        finally:
            # Never leave tracing enabled: it slows down and pollutes
            # whatever runs next.
            tracemalloc.stop()
        return result, memory_report, elapsed

    train = pd.read_csv(data_dir + trainname + '.csv', index_col=0, sep=',')
    test = pd.read_csv(data_dir + testname + '.csv', index_col=0, sep=',')
    trainlabel = pd.read_csv(data_dir + trainname + '_label.csv',
                             header=0, index_col=0, sep=',')
    testlabel = pd.read_csv(data_dir + testname + '_label.csv',
                            header=0, index_col=0, sep=',')

    # Train first, test second: the index ranges used for the labelled and
    # unlabelled sets below depend on this order.
    newdata = pd.concat([train, test], axis=1)
    newlabel = pd.concat([trainlabel, testlabel], axis=0)

    # CsvDataset is given bare file names with save_path="", so the files
    # are written to, and read back from, the current working directory.
    os.chdir(out_dir)
    newdata.to_csv('data_scvi.csv')
    newlabel.to_csv('labels_scvi.csv')
    data = CsvDataset('data_scvi.csv',
                      save_path="",
                      sep=",",
                      labels_file="labels_scvi.csv",
                      gene_by_cell=True)

    n_epochs = 100
    n_train = trainlabel.shape[0]
    n_test = testlabel.shape[0]

    # ---- training ----
    def _train():
        scanvi = SCANVI(data.nb_genes, data.n_batches, data.n_labels)
        trainer = SemiSupervisedTrainer(scanvi, data, frequency=5)
        # Replace the trainer's own labelled/unlabelled sets with the
        # explicit train/test index ranges.
        trainer.labelled_set = trainer.create_posterior(
            indices=list(range(n_train)), shuffle=False)
        trainer.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer.unlabelled_set = trainer.create_posterior(
            indices=list(range(n_train, n_train + n_test)), shuffle=False)
        trainer.unlabelled_set.to_monitor = ['ll', 'accuracy']
        trainer.train(n_epochs)
        return trainer

    trainer_scanvi, mem_train, time_train = _measure(_train)

    # ---- prediction ----
    # Notes carried over from the original author: the labels of the test
    # set are in y_pred; labels are returned as numbers and should be mapped
    # back to the real labels; indices are permuted.
    (y_true, y_pred), mem_test, time_test = _measure(
        trainer_scanvi.unlabelled_set.compute_predictions)

    truelab = pd.DataFrame(list(y_true))
    pred = pd.DataFrame(list(y_pred))

    os.chdir(out_dir)
    truelab.to_csv(n + "_scVI_True.csv", index=False)
    pred.to_csv(n + "_scVI_Pred.csv", index=False)
    return mem_train, time_train, mem_test, time_test