Ejemplo n.º 1
0
def _read_moana_matrix(path):
    """Load a comma-separated expression CSV into a freshly built ExpMatrix."""
    raw = ExpMatrix.read_tsv(path, sep=',')
    data = ExpMatrix(X=raw.X, genes=raw.genes, cells=raw.cells)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'
    return data


def run_moana(trainname, testname, n):
    """Benchmark the moana ``CellTypeClassifier`` on one train/test pair.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, fits the classifier on the training matrix,
    predicts the test matrix, and writes ``<n>_moana_True.csv`` and
    ``<n>_moana_Pred.csv`` into the moana output directory.

    Args:
        trainname: Base file name (without extension) of the training set.
        testname: Base file name (without extension) of the test set.
        n: String prefix for the two output CSV files.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``: whatever
        ``display_top`` returns for the training and prediction snapshots,
        and the elapsed wall-clock time of each phase truncated to whole
        seconds.

    Note:
        The process working directory is changed to the output directory
        and is not restored.
    """
    base = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    # ---- training ---------------------------------------------------------
    data = _read_moana_matrix(base + trainname + '.csv')
    truelab = pd.read_csv(base + trainname + '_label.csv',
                          header=0, index_col=0, sep=',')
    # The label file is expected to carry the cell types in a column 'x'.
    labels = CellAnnVector(cells=data.cells, data=truelab['x'].values)

    now = time.time()
    tracemalloc.start()
    try:
        clf = CellTypeClassifier()
        clf.fit(matrix=data, cell_labels=labels)
        snapshot = tracemalloc.take_snapshot()
    finally:
        # start() is a no-op while tracing is already active, so tracing must
        # be stopped (which also discards the collected traces); otherwise the
        # prediction snapshot would still contain the training allocations.
        tracemalloc.stop()
    mem_train = display_top(snapshot)
    later = time.time()
    time_train = int(later - now)

    # ---- prediction -------------------------------------------------------
    data = _read_moana_matrix(base + testname + '.csv')

    now = time.time()
    tracemalloc.start()
    try:
        predictions = clf.predict(data)
        snapshot = tracemalloc.take_snapshot()
    finally:
        # Do not leave tracing enabled for whatever benchmark runs next.
        tracemalloc.stop()
    mem_test = display_top(snapshot)
    later = time.time()
    time_test = int(later - now)

    pred = pd.DataFrame(np.asarray(predictions))

    # The test labels are re-read with index_col=None, so the index column of
    # the label file is written out as an ordinary column as well.
    truelab = pd.read_csv(base + testname + '_label.csv',
                          header=0, index_col=None, sep=',')

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/moanna/vary_train")

    truelab.to_csv(n + "_moana_True.csv", index=False)
    pred.to_csv(n + "_moana_Pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test
def run_SVMreject(trainname, testname, n, reject_threshold=0.7):
    """Benchmark a calibrated linear SVM with a rejection option.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, transposes both expression tables before handing
    them to scikit-learn, fits ``CalibratedClassifierCV(LinearSVC())`` and
    predicts the test set.  Predictions whose highest class probability is
    below ``reject_threshold`` are replaced by ``'Unassigned'``.  True and
    predicted labels are written to ``<n>_SVMreject_true.csv`` and
    ``<n>_SVMreject_pred.csv`` in the SVMreject output directory.

    Args:
        trainname: Base file name (without extension) of the training set.
        testname: Base file name (without extension) of the test set.
        n: String prefix for the two output CSV files.
        reject_threshold: Minimum top class probability required to keep a
            prediction.  Defaults to 0.7, the value previously hard-coded.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``: whatever
        ``display_top`` returns for the training and prediction snapshots,
        and the elapsed wall-clock time of each phase truncated to whole
        seconds.  The probability computation used for rejection is not
        included in ``time_test``.

    Note:
        The process working directory is changed to the output directory
        and is not restored.
    """
    base = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    # read the data
    train = pd.read_csv(base + trainname + '.csv', index_col=0, sep=',')
    test = pd.read_csv(base + testname + '.csv', index_col=0, sep=',')

    # The label files are expected to carry the cell types in a column 'x'.
    # `.values` replaces `Series.ravel()`, which recent pandas deprecates.
    y_train = pd.read_csv(base + trainname + '_label.csv',
                          header=0, index_col=0, sep=',')['x'].values
    y_test = pd.read_csv(base + testname + '_label.csv',
                         header=0, index_col=0, sep=',')['x'].values

    train = train.transpose()
    test = test.transpose()

    # ---- training ---------------------------------------------------------
    now = time.time()
    tracemalloc.start()
    try:
        clf = CalibratedClassifierCV(LinearSVC())
        clf.fit(train, y_train)
        snapshot = tracemalloc.take_snapshot()
    finally:
        # start() is a no-op while tracing is already active, so tracing must
        # be stopped (which also discards the collected traces); otherwise the
        # prediction snapshot would still contain the training allocations.
        tracemalloc.stop()
    mem_train = display_top(snapshot)
    later = time.time()
    time_train = int(later - now)

    # ---- prediction -------------------------------------------------------
    now = time.time()
    tracemalloc.start()
    try:
        predicted = clf.predict(test)
        snapshot = tracemalloc.take_snapshot()
    finally:
        # Do not leave tracing enabled for whatever benchmark runs next.
        tracemalloc.stop()
    mem_test = display_top(snapshot)
    later = time.time()
    time_test = int(later - now)

    # ---- rejection --------------------------------------------------------
    prob = np.max(clf.predict_proba(test), axis=1)

    # Object dtype guarantees that 'Unassigned' is stored verbatim: a
    # fixed-width string array would truncate it and a numeric one would
    # refuse the assignment.
    predicted = np.asarray(predicted, dtype=object)
    predicted[prob < reject_threshold] = 'Unassigned'

    truelab = pd.DataFrame(y_test)
    pred = pd.DataFrame(predicted)

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/SVMreject/vary_test")

    truelab.to_csv(n + "_SVMreject_true.csv", index=False)
    pred.to_csv(n + "_SVMreject_pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test
Ejemplo n.º 3
0
def run_scVI(trainname, testname, n):
    """Benchmark scVI's semi-supervised SCANVI model on one train/test pair.

    Reads ``<trainname>.csv`` / ``<trainname>_label.csv`` and
    ``<testname>.csv`` / ``<testname>_label.csv`` from the tabulaMuris
    benchmark directory, concatenates the two expression tables column-wise
    and the two label tables row-wise, and stores the result as
    ``data_scvi.csv`` / ``labels_scvi.csv`` in the scVI output directory so
    it can be loaded through ``CsvDataset``.  The first
    ``trainlabel.shape[0]`` cells form the labelled set and the following
    ``testlabel.shape[0]`` cells form the unlabelled set.  After training
    for 100 epochs the unlabelled set is predicted and the results are
    written to ``<n>_scVI_True.csv`` and ``<n>_scVI_Pred.csv``.

    Args:
        trainname: Base file name (without extension) of the training set.
        testname: Base file name (without extension) of the test set.
        n: String prefix for the two output CSV files.

    Returns:
        Tuple ``(mem_train, time_train, mem_test, time_test)``: whatever
        ``display_top`` returns for the training and prediction snapshots,
        and the elapsed wall-clock time of each phase truncated to whole
        seconds.

    Note:
        The process working directory is changed to the output directory
        and is not restored.  The intermediate files ``data_scvi.csv`` and
        ``labels_scvi.csv`` are overwritten on every call.
    """
    base = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    train = pd.read_csv(base + trainname + '.csv', index_col=0, sep=',')
    test = pd.read_csv(base + testname + '.csv', index_col=0, sep=',')
    trainlabel = pd.read_csv(base + trainname + '_label.csv',
                             header=0, index_col=0, sep=',')
    testlabel = pd.read_csv(base + testname + '_label.csv',
                            header=0, index_col=0, sep=',')

    newdata = pd.concat([train, test], axis=1)
    newlabel = pd.concat([trainlabel, testlabel], axis=0)

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/scVI/vary_test")

    # CsvDataset is given an empty save_path, so the combined files are
    # written to (and read back from) the current working directory.
    newdata.to_csv('data_scvi.csv')
    newlabel.to_csv('labels_scvi.csv')
    data = CsvDataset('data_scvi.csv',
                      save_path="",
                      sep=",",
                      labels_file="labels_scvi.csv",
                      gene_by_cell=True)

    n_epochs = 100
    n_train = trainlabel.shape[0]
    n_test = testlabel.shape[0]

    # ---- training ---------------------------------------------------------
    # The semi-supervised trainer would otherwise pick its own split of the
    # data; the labelled / unlabelled sets are overridden below so that they
    # coincide with the train / test cells.
    now = time.time()
    tracemalloc.start()
    try:
        scanvi = SCANVI(data.nb_genes, data.n_batches, data.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, data, frequency=5)

        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
            indices=list(range(0, n_train)), shuffle=False)
        trainer_scanvi.labelled_set.to_monitor = ['ll', 'accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
            indices=list(range(n_train, n_train + n_test)), shuffle=False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll', 'accuracy']

        trainer_scanvi.train(n_epochs)

        snapshot = tracemalloc.take_snapshot()
    finally:
        # start() is a no-op while tracing is already active, so tracing must
        # be stopped (which also discards the collected traces); otherwise the
        # prediction snapshot would still contain the training allocations.
        tracemalloc.stop()
    mem_train = display_top(snapshot)
    later = time.time()
    time_train = int(later - now)

    # ---- prediction -------------------------------------------------------
    # NOTE(review): per the original author's notes the returned labels are
    # numeric codes that still need to be mapped back to the real label
    # names, and the indices are permuted; no mapping is applied here.
    now = time.time()
    tracemalloc.start()
    try:
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        snapshot = tracemalloc.take_snapshot()
    finally:
        # Do not leave tracing enabled for whatever benchmark runs next.
        tracemalloc.stop()
    mem_test = display_top(snapshot)
    later = time.time()
    time_test = int(later - now)

    truelab = pd.DataFrame(list(y_true))
    pred = pd.DataFrame(list(y_pred))

    # Re-select the output directory before writing the results, as the
    # original code did.
    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/scVI/vary_test")

    truelab.to_csv(n + "_scVI_True.csv", index=False)
    pred.to_csv(n + "_scVI_Pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test