Beispiel #1
0
def integrate_datasets(datasets, genes, method='scanorama', merge=True):
    """
    Batch-correct and integrate several expression datasets.

    Parameters:
    -----------
        datasets: list of scipy.sparse.csr_matrix or np.array()
            Passed unchanged to the integration backend.
        genes: list of lists
            Gene names, one list per dataset.
        method: str
            Integration backend. Only 'scanorama' is supported.
        merge: bool
            If True, the per-dataset outputs are stacked row-wise: the
            corrected matrices via scipy.sparse.vstack(...).toarray() and
            the reduced embeddings via np.vstack.
    Returns:
    -----------
        integrated_dim_red: np.array() cell by dimensions
            (left as returned by the backend when merge is False)
        corrected_expression_matrix: np.array() if merge, or lists of np.array() or scipy.sparse.csr_matrix
        corrected_genes: np.array()
    Raises:
    -----------
        ValueError: if `method` is not a supported backend.
    """
    # Reject unknown backends up front. Previously an unsupported `method`
    # fell through to the code below and failed on unbound local variables.
    if method != 'scanorama':
        raise ValueError(
            "Unsupported integration method: {!r} "
            "(supported: 'scanorama')".format(method))

    # Imported lazily so the dependency is only needed when actually used.
    import scanorama
    integrated_dim_red, corrected_expression_matrix, corrected_genes = \
        scanorama.correct(datasets, genes, dimred=100, return_dimred=True)

    if merge:
        corrected_expression_matrix = scipy.sparse.vstack(
            corrected_expression_matrix).toarray()
        integrated_dim_red = np.vstack(integrated_dim_red)

    return integrated_dim_red, corrected_expression_matrix, corrected_genes
Beispiel #2
0
def correct_scanorama(Xs, genes):
    """Run Scanorama correction on ``Xs`` and return the stacked result.

    Each dataset in ``Xs`` is paired with the same ``genes`` list. The
    corrected datasets are combined with the module-level ``vstack``; the
    gene list returned by Scanorama is discarded.
    """
    import scanorama

    # One reference to `genes` per dataset, as scanorama.correct expects a
    # gene list for every dataset.
    genes_per_dataset = [genes for _ in Xs]
    corrected, _ = scanorama.correct(
        Xs, genes_per_dataset, alpha=0, batch_size=10000)
    return vstack(corrected)
                               dtype=int)

# Entries 1 and 2 of the dimension vector hold the gene count and the
# batch count; entries 3.. hold the number of rows belonging to each batch.
G, B = simulation_dim[1], simulation_dim[2]

# All batches share one list of synthetic gene names "gene_1" .. "gene_G".
gene_names = ["gene_" + str(num) for num in range(1, G + 1)]
simulation_gene_list = [gene_names for _ in range(B)]

# Cut the count matrix into consecutive row blocks, one block per batch.
simulation_data = []
offset = 0
for b in range(B):
    n_rows = simulation_dim[3 + b]
    simulation_data.append(simulation_count[offset:offset + n_rows, :])
    offset += n_rows

simulation_integrated, simulation_corrected, simulation_genes = (
    scanorama.correct(simulation_data,
                      simulation_gene_list,
                      return_dimred=True))

# Each *_corrected batch is a scipy.sparse.csr.csr_matrix, so it is
# converted to a dense array before being written as text.
for b in range(B):
    batch_tag = str(b + 1)
    integrated_path = ("./data/scanorama_simulation_v1_integrated_batch" +
                       batch_tag + ".txt")
    corrected_path = ("./data/scanorama_simulation_v1_corrected_batch" +
                      batch_tag + ".txt")
    np.savetxt(integrated_path,
               simulation_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt(corrected_path,
               simulation_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")
np.savetxt("scanorama_simulation_v1_genes.txt",
           simulation_genes,
Beispiel #4
0
# Label for this analysis.
# NOTE(review): not referenced in the visible part of the file; presumably
# used to name outputs -- confirm against the rest of the script.
NAMESPACE = 'er_stress'

# Pancreas datasets to load; this list is handed to load_names() in the
# __main__ block. Entries carry no file extension, so they are presumably
# path prefixes resolved by load_names() -- verify against that helper.
data_names = [
    'data/pancreas/pancreas_inDrop',
    'data/pancreas/pancreas_multi_celseq2_expression_matrix',
    'data/pancreas/pancreas_multi_celseq_expression_matrix',
    'data/pancreas/pancreas_multi_fluidigmc1_expression_matrix',
    'data/pancreas/pancreas_multi_smartseq2_expression_matrix',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names)
    #datasets, genes = merge_datasets(datasets, genes_list)
    #datasets_dimred, genes = process_data(datasets, genes, hvg=hvg)
    datasets, genes = correct(datasets, genes_list)
    X = vstack(datasets).toarray()
    X[X < 0] = 0

    cell_labels = (
        open('data/cell_labels/pancreas_cluster.txt').read().rstrip().split())
    er_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta_er']
    beta_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta']

    gadd_idx = list(genes).index('GADD45A')
    herp_idx = list(genes).index('HERPUD1')

    plt.figure()
    plt.boxplot([X[er_idx, gadd_idx], X[beta_idx, gadd_idx]], showmeans=True)
    plt.title('GADD45A (p = {})'.format(
        ttest_ind(X[er_idx, gadd_idx], X[beta_idx, gadd_idx],
import scanorama

# Load the space-delimited integer count table and transpose it, so the
# batches can be selected by row range below.
pancreas_matrix = np.transpose(
    np.genfromtxt("../../data/count_data_pancreas_v1.txt",
                  delimiter=" ",
                  dtype=int))

# The four batches share one list of synthetic gene names
# "gene_1" .. "gene_2480".
gene_names = ["gene_" + str(num) for num in range(1, 2481)]
pancreas_gene_list = [gene_names for _ in range(4)]

# Consecutive row boundaries of the four batches in the transposed matrix.
row_bounds = (0, 1006, 3337, 4932, 7095)
pancreas_data = [
    pancreas_matrix[start:stop, 0:2480]
    for start, stop in zip(row_bounds[:-1], row_bounds[1:])
]

pancreas_integrated, pancreas_corrected, pancreas_genes = (
    scanorama.correct(pancreas_data,
                      pancreas_gene_list,
                      return_dimred=True))

# Each *_corrected batch is a scipy.sparse.csr.csr_matrix, so it is
# converted to a dense array before being written as text.
for b in range(4):
    batch_tag = str(b + 1)
    integrated_path = ("scanorama_pancreas_v1_integrated_batch" + batch_tag +
                       ".txt")
    corrected_path = ("scanorama_pancreas_v1_corrected_batch" + batch_tag +
                      ".txt")
    np.savetxt(integrated_path,
               pancreas_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt(corrected_path,
               pancreas_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")

# Gene names returned by the correction, written as strings.
np.savetxt("scanorama_pancreas_v1_genes.txt",
           pancreas_genes,
           fmt="%s",
           delimiter="\t")
Beispiel #6
0
import numpy as np
import scanorama

# Load the space-delimited integer count table and transpose it, so the
# two batches can be selected by row range below.
hemat_matrix = np.transpose(
    np.genfromtxt("../../data/count_data_hemat_v1.txt",
                  delimiter=" ",
                  dtype=int))

# Both batches share one list of synthetic gene names
# "gene_1" .. "gene_3470".
gene_names = ["gene_" + str(num) for num in range(1, 3471)]
hemat_gene_list = [gene_names for _ in range(2)]

# First batch: rows 0..2728; second batch: rows 2729..4648.
first_batch = hemat_matrix[0:2729, 0:3470]
second_batch = hemat_matrix[2729:4649, 0:3470]
hemat_data = [first_batch, second_batch]

hemat_integrated, hemat_corrected, hemat_genes = (
    scanorama.correct(hemat_data, hemat_gene_list, return_dimred=True))

# Each *_corrected batch is a scipy.sparse.csr.csr_matrix, so it is
# converted to a dense array before being written as text.
for b in range(2):
    batch_tag = str(b + 1)
    integrated_path = ("scanorama_hemat_v1_integrated_batch" + batch_tag +
                       ".txt")
    corrected_path = ("scanorama_hemat_v1_corrected_batch" + batch_tag +
                      ".txt")
    np.savetxt(integrated_path,
               hemat_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt(corrected_path,
               hemat_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")

# Gene names returned by the correction, written as strings.
np.savetxt("scanorama_hemat_v1_genes.txt",
           hemat_genes,
           fmt="%s",
           delimiter="\t")
                         delimiter=" ",
                         dtype=int)

# Entries 1 and 2 of the dimension vector hold the gene count and the
# batch count; entries 3.. hold the number of rows belonging to each batch.
G, B = LUAD_dim[1], LUAD_dim[2]

# All batches share one list of synthetic gene names "gene_1" .. "gene_G".
gene_names = ["gene_" + str(num) for num in range(1, G + 1)]
LUAD_gene_list = [gene_names for _ in range(B)]

# Cut the count matrix into consecutive row blocks, one block per batch.
LUAD_data = []
offset = 0
for b in range(B):
    n_rows = LUAD_dim[3 + b]
    LUAD_data.append(LUAD_count[offset:offset + n_rows, :])
    offset += n_rows

LUAD_integrated, LUAD_corrected, LUAD_genes = (
    scanorama.correct(LUAD_data, LUAD_gene_list, return_dimred=True))

# Each *_corrected batch is a scipy.sparse.csr.csr_matrix, so it is
# converted to a dense array before being written as text.
for b in range(B):
    batch_tag = str(b + 1)
    integrated_path = ("./data/scanorama_LUAD_v1_integrated_batch" +
                       batch_tag + ".txt")
    corrected_path = ("./data/scanorama_LUAD_v1_corrected_batch" +
                      batch_tag + ".txt")
    np.savetxt(integrated_path,
               LUAD_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt(corrected_path,
               LUAD_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")

# Gene names returned by the correction, written as strings.
np.savetxt("scanorama_LUAD_v1_genes.txt",
           LUAD_genes,
           fmt="%s",
           delimiter="\t")