Ejemplo n.º 1
0
def run_moana(trainname, testname, n):
    """Train moana on one benchmark dataset, predict another, profile both.

    Expression matrices are read from ``<name>.csv`` and labels from
    ``<name>_label.csv`` inside the hard-coded Tabula Muris benchmark
    directory. The true and predicted test labels are written to
    ``<n>_moana_True.csv`` and ``<n>_moana_Pred.csv`` in the hard-coded
    output directory.

    Parameters
    ----------
    trainname : base file name (without extension) of the training dataset
    testname : base file name (without extension) of the test dataset
    n : string prefix used for the two output file names

    Returns
    -------
    tuple ``(mem_train, time_train, mem_test, time_test)``. The memory
    values are whatever ``display_top`` returns for the tracemalloc snapshot
    of each phase; the times are elapsed wall-clock seconds truncated to int.

    Notes
    -----
    The process working directory is changed to the output directory and is
    not restored.
    """
    data_dir = '/albona/nobackup/biostat/datasets/singlecell/tabulaMuris_benchmark/'

    def load_expression(name):
        # Read the CSV and rebuild the matrix from its parts, then set the
        # axis names exactly as the classifier input was prepared originally.
        raw = ExpMatrix.read_tsv(data_dir + name + '.csv', sep=',')
        expr = ExpMatrix(X=raw.X, genes=raw.genes, cells=raw.cells)
        expr.genes.name = 'Genes'
        expr.cells.name = 'Cells'
        expr.index.name = 'Genes'
        expr.columns.name = 'Cells'
        return expr

    # ---- training ---------------------------------------------------------
    data = load_expression(trainname)
    train_labels = pd.read_csv(data_dir + trainname + '_label.csv',
                               header=0, index_col=0, sep=',')
    cell_labels = CellAnnVector(cells=data.cells,
                                data=train_labels['x'].values)

    tracemalloc.start()
    try:
        started = time.time()
        clf = CellTypeClassifier()
        clf.fit(matrix=data, cell_labels=cell_labels)
        # Stop the clock before snapshotting so that the cost of the memory
        # report is not counted as training time.
        time_train = int(time.time() - started)
        mem_train = display_top(tracemalloc.take_snapshot())
    finally:
        # Calling tracemalloc.start() again while tracing is already active
        # does nothing, so tracing must be stopped here; otherwise the
        # prediction snapshot would still contain the training allocations.
        tracemalloc.stop()

    # ---- prediction -------------------------------------------------------
    data = load_expression(testname)

    tracemalloc.start()
    try:
        started = time.time()
        predictions = clf.predict(data)
        time_test = int(time.time() - started)
        mem_test = display_top(tracemalloc.take_snapshot())
    finally:
        # Do not leave tracing enabled (and slowing down) the caller.
        tracemalloc.stop()

    pred = pd.DataFrame(np.asarray(predictions))

    truelab = pd.read_csv(data_dir + testname + '_label.csv',
                          header=0, index_col=None, sep=',')

    os.chdir("/dora/nobackup/yuec/scclassify/benchmark/moanna/vary_train")

    truelab.to_csv(n + "_moana_True.csv", index=False)
    pred.to_csv(n + "_moana_Pred.csv", index=False)

    return mem_train, time_train, mem_test, time_test
Ejemplo n.º 2
0
def run_moana(ad, ad_ref):
    """Predict labels for the cells of ``ad`` with moana trained on ``ad_ref``.

    ``ad_ref`` is the labelled reference; its labels are read from
    ``ad_ref.obs['cell']``. Only reference genes that also occur in ``ad``
    are used, and ``ad`` is subset to that same gene list before prediction.

    Returns the ``.values`` of the prediction object produced by the
    classifier.
    """
    # Keep only the reference genes that are present in the query as well.
    shared_genes = ad_ref.var_names.isin(ad.var_names)
    ref = ad_ref[:, shared_genes]

    # ExpMatrix is built genes x cells, hence the transpose.
    # NOTE(review): `.todense()` presumes `.X` is a sparse matrix - confirm.
    ref_matrix = ExpMatrix(
        X=ref.X.T.todense(),
        genes=ref.var_names,
        cells=ref.obs_names,
    )
    ref_labels = CellAnnVector(ref.obs['cell'], cells=ref.obs_names)

    classifier = CellTypeClassifier()
    classifier.fit(matrix=ref_matrix, cell_labels=ref_labels)

    # Subset the query once, using the gene list of the reference subset.
    query = ad[:, ref.var_names]
    query_matrix = ExpMatrix(
        X=query.X.T.todense(),
        genes=query.var_names,
        cells=query.obs_names,
    )

    predicted = classifier.predict(query_matrix)
    return predicted.values
Ejemplo n.º 3
0
def run_moana(input_dir, output_dir, datafile, labfile, Rfile):
    '''
    Run moana with the pretrained PBMC classifier.

    NOTE: at the moment it is only possible to run moana with a pretrained
    classifier, using the PBMC classifier is therefore hardcoded here. The
    pickle "moana_pbmc_classifier.pickle" is read from input_dir.

    Writes "Moana_pred.csv" (predicted labels) and "Moana_time.csv"
    (prediction time in seconds) into output_dir.

    The working directory is changed to input_dir and then to output_dir and
    is not restored; a relative output_dir is therefore resolved against
    input_dir.

    Parameters
    ----------
    input_dir : directory of the input files
    output_dir : directory of the output files
    datafile : name of the data file (comma separated)
    labfile : name of the label file (accepted but not used by this function)
    Rfile : file to read the cross validation indices from; it must define
        the R object 'Cells_to_Keep'
    '''

    os.chdir(input_dir)

    # read the Rdata file
    robjects.r['load'](Rfile)

    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')

    matrix = ExpMatrix.read_tsv(datafile, sep=',')
    matrix = matrix.iloc[tokeep]

    # Transpose the values and swap the two axis labels when rebuilding the
    # matrix. NOTE(review): presumably because the file stores cells as
    # rows - confirm against the data files.
    data = ExpMatrix(X=np.transpose(matrix.X),
                     genes=matrix.cells,
                     cells=matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle("moana_pbmc_classifier.pickle")

    # Only the prediction itself is timed.
    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    # A former bare `np.asarray(predictions)` call was dropped here: its
    # result was discarded, so it never affected the written output.
    pred = pd.DataFrame(predictions)

    os.chdir(output_dir)

    pred.to_csv("Moana_pred.csv", index=False)

    with open("Moana_time.csv", 'w') as f:
        f.write("%f\n" % runtime)
Ejemplo n.º 4
0
def test_sparse(tmpdir, my_matrix):
    """Round-trip a matrix through the sparse text format."""
    mtx_path = tmpdir.join('expression_matrix.mtx').strpath
    my_matrix.write_sparse(mtx_path)

    reloaded = ExpMatrix.read_sparse(mtx_path)

    # Reading must produce a distinct object that still compares equal.
    assert reloaded is not my_matrix
    assert reloaded == my_matrix
Ejemplo n.º 5
0
def test_tsv(tmpdir, my_matrix):
    """Round-trip a matrix through the tab-separated text format."""
    tsv_path = tmpdir.join('expression_matrix.tsv').strpath
    my_matrix.write_tsv(tsv_path)

    reloaded = ExpMatrix.read_tsv(tsv_path)

    # Reading must produce a distinct object that still compares equal.
    assert reloaded is not my_matrix
    assert reloaded == my_matrix
Ejemplo n.º 6
0
def my_matrix(my_expression_file):
    """Return the expression matrix read from ``my_expression_file``."""
    loaded = ExpMatrix.read_tsv(my_expression_file)
    return loaded
Ejemplo n.º 7
0
def run_moana(DataPath,
              LabelsPath,
              ClassifierPath,
              OutputDir,
              GeneOrderPath="",
              NumGenes=0):
    '''
    run moana
    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Only cells whose label is one of the PBMC populations listed in ct_old
    are kept; their labels are renamed to the matching entry of ct_new
    before the true labels are written out.

    The working directory is changed to OutputDir and is not restored.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv); the labels are
    taken from its first column.
    ClassifierPath : Data file path to the pretrained classifier.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    only read when NumGenes > 0, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    matrix = ExpMatrix.read_tsv(DataPath, sep=',')
    truelab = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',')

    ct_old = [
        'CD19+ B', 'CD14+ Monocyte', 'CD4+/CD45RA+/CD25- Naive T',
        'CD4+/CD45RO+ Memory', 'CD8+/CD45RA+ Naive Cytotoxic', 'Dendritic',
        'CD56+ NK'
    ]
    ct_new = [
        'B cells', 'CD14+ monocytes', 'Naive CD4+ T cells',
        'Memory CD4+ T cells', 'Naive CD8+ T cells', 'Dendritic cells',
        'NK cells'
    ]

    # One boolean per cell (row). A 1-D mask is used for both the labels and
    # the matrix so that the two stay aligned; the same mask indexes the
    # matrix rows, which is why a single label column is expected.
    tokeep2 = truelab.iloc[:, 0].isin(ct_old).values
    truelab = truelab[tokeep2]
    print(len(truelab))
    matrix = matrix.iloc[tokeep2]

    # Rename the kept labels; old and new names do not overlap, so a single
    # exact-match replacement is unambiguous.
    truelab = truelab.replace(dict(zip(ct_old, ct_new)))

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,
                               header=0,
                               index_col=None,
                               sep=',')
        # The first NumGenes entries of the first column are used as
        # positional column indices into the matrix.
        feat_to_use = features.iloc[0:NumGenes, 0]
        matrix = matrix.iloc[:, feat_to_use]

    # Transpose the values and swap the two axis labels when rebuilding the
    # matrix, since the input file is cells x genes (see DataPath above).
    data = ExpMatrix(X=np.transpose(matrix.X),
                     genes=matrix.cells,
                     cells=matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    # Only the prediction itself is timed.
    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    # A former bare `np.asarray(predictions)` call was dropped here: its
    # result was discarded, so it never affected the written output.
    pred = pd.DataFrame(predictions)

    os.chdir(OutputDir)

    # File names carry the gene count whenever NumGenes is not 0.
    if (NumGenes == 0):
        prefix = "moana"
    else:
        prefix = "moana_" + str(NumGenes)

    truelab.to_csv(prefix + "_True_Labels.csv", index=False)
    pred.to_csv(prefix + "_Pred_Labels.csv", index=False)
    with open(prefix + "_Total_Time.csv", 'w') as f:
        f.write("%f\n" % runtime)
Ejemplo n.º 8
0
def my_matrix(my_gene_names, my_cells, my_X):
    """Assemble an ``ExpMatrix`` from gene names, cell names and values.

    Illustrative inputs noted by the original author: genes
    ``['a', 'b', 'c', 'd']``, samples ``['s1', 's2', 's3']`` and
    ``X = np.arange(12, dtype=np.float64).reshape(4, 3)``.
    """
    return ExpMatrix(X=my_X, genes=my_gene_names, cells=my_cells)