def integrate_datasets(datasets, genes, method='scanorama', merge=True):
    """Integrate and batch-correct several expression datasets.

    Parameters
    ----------
    datasets : list of scipy.sparse.csr_matrix or np.ndarray
        One expression matrix per dataset.
    genes : list of lists
        Gene names for each entry of ``datasets``, in the same order.
    method : str, optional
        Integration backend. Only ``'scanorama'`` is implemented.
    merge : bool, optional
        When true, the per-dataset results are stacked row-wise into a
        single dense array each; otherwise they are returned per dataset.

    Returns
    -------
    integrated_dim_red : np.ndarray or list of np.ndarray
        Cell-by-dimension embedding (one stacked array if ``merge``).
    corrected_expression_matrix : np.ndarray or list
        Dense stacked array if ``merge``; otherwise the per-dataset
        matrices exactly as returned by the backend.
    corrected_genes : array-like
        Gene names matching the columns of the corrected matrices.

    Raises
    ------
    ValueError
        If ``method`` names an unsupported backend.
    """
    # Reject unknown backends up front; previously this fell through and
    # crashed with an UnboundLocalError on the result variables.
    if method != 'scanorama':
        raise ValueError(
            "Unsupported integration method: {!r} "
            "(expected 'scanorama')".format(method)
        )

    # Imported lazily so the dependency is only needed when actually used.
    import scanorama

    integrated_dim_red, corrected_expression_matrix, corrected_genes = \
        scanorama.correct(datasets, genes, dimred=100, return_dimred=True)

    if merge:
        # The corrected matrices are stacked as sparse matrices and then
        # densified.
        corrected_expression_matrix = scipy.sparse.vstack(
            corrected_expression_matrix).toarray()
        integrated_dim_red = np.vstack(integrated_dim_red)

    return integrated_dim_red, corrected_expression_matrix, corrected_genes
def correct_scanorama(Xs, genes):
    """Batch-correct ``Xs`` with Scanorama and return one stacked matrix.

    Every dataset in ``Xs`` is paired with the same ``genes`` list.
    Scanorama's ``correct`` is called with ``alpha=0`` and
    ``batch_size=10000``; the gene list it returns is discarded and the
    corrected datasets are combined with the module-level ``vstack``.
    """
    import scanorama

    # Scanorama expects one gene list per dataset.
    shared_gene_lists = [genes for _ in Xs]
    corrected_batches, _ = scanorama.correct(
        Xs, shared_gene_lists, alpha=0, batch_size=10000
    )
    return vstack(corrected_batches)
dtype=int) G = simulation_dim[1] B = simulation_dim[2] simulation_gene_list = ["gene_" + str(i) for i in range(1, (G + 1))] simulation_gene_list = [simulation_gene_list for i in range(0, B)] offset = 0 simulation_data = [] for b in range(B): simulation_data.append( simulation_count[offset:(offset + simulation_dim[(3 + b)]), :]) offset += simulation_dim[(3 + b)] simulation_integrated, simulation_corrected, simulation_genes = scanorama.correct( simulation_data, simulation_gene_list, return_dimred=True) #Please note that *_corrected is of type "scipy.sparse.csr.csr_matrix". for b in range(B): np.savetxt("./data/scanorama_simulation_v1_integrated_batch" + str(b + 1) + ".txt", simulation_integrated[b], fmt='%10.9f', delimiter="\t") np.savetxt("./data/scanorama_simulation_v1_corrected_batch" + str(b + 1) + ".txt", simulation_corrected[b].toarray(), fmt='%10.9f', delimiter="\t") np.savetxt("scanorama_simulation_v1_genes.txt", simulation_genes,
NAMESPACE = 'er_stress' data_names = [ 'data/pancreas/pancreas_inDrop', 'data/pancreas/pancreas_multi_celseq2_expression_matrix', 'data/pancreas/pancreas_multi_celseq_expression_matrix', 'data/pancreas/pancreas_multi_fluidigmc1_expression_matrix', 'data/pancreas/pancreas_multi_smartseq2_expression_matrix', ] if __name__ == '__main__': datasets, genes_list, n_cells = load_names(data_names) #datasets, genes = merge_datasets(datasets, genes_list) #datasets_dimred, genes = process_data(datasets, genes, hvg=hvg) datasets, genes = correct(datasets, genes_list) X = vstack(datasets).toarray() X[X < 0] = 0 cell_labels = ( open('data/cell_labels/pancreas_cluster.txt').read().rstrip().split()) er_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta_er'] beta_idx = [i for i, cl in enumerate(cell_labels) if cl == 'beta'] gadd_idx = list(genes).index('GADD45A') herp_idx = list(genes).index('HERPUD1') plt.figure() plt.boxplot([X[er_idx, gadd_idx], X[beta_idx, gadd_idx]], showmeans=True) plt.title('GADD45A (p = {})'.format( ttest_ind(X[er_idx, gadd_idx], X[beta_idx, gadd_idx],
import scanorama

# Load the space-delimited integer count table and transpose it, so the row
# slices below pick out batches and the column slice picks out genes.
pancreas_data = np.genfromtxt("../../data/count_data_pancreas_v1.txt",
                              delimiter=" ",
                              dtype=int).T

# Synthetic gene names gene_1 .. gene_2480, shared by all four batches.
gene_names = ["gene_" + str(n + 1) for n in range(2480)]
pancreas_gene_list = [gene_names] * 4

# The four batches occupy consecutive row ranges of the transposed table.
batch_edges = [0, 1006, 3337, 4932, 7095]
pancreas_data = [
    pancreas_data[start:stop, 0:2480]
    for start, stop in zip(batch_edges[:-1], batch_edges[1:])
]

pancreas_integrated, pancreas_corrected, pancreas_genes = scanorama.correct(
    pancreas_data, pancreas_gene_list, return_dimred=True)

# Write one integrated and one corrected file per batch (1-based suffix).
# The *_corrected entries are scipy.sparse CSR matrices, hence .toarray().
for b in range(4):
    batch_tag = str(b + 1)
    np.savetxt("scanorama_pancreas_v1_integrated_batch" + batch_tag + ".txt",
               pancreas_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt("scanorama_pancreas_v1_corrected_batch" + batch_tag + ".txt",
               pancreas_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")

# Gene names returned by the correction, written as strings.
np.savetxt("scanorama_pancreas_v1_genes.txt",
           pancreas_genes,
           fmt="%s",
           delimiter="\t")
import numpy as np
import scanorama

# Load the space-delimited integer count table and transpose it, so the row
# slices below pick out batches and the column slice picks out genes.
hemat_data = np.genfromtxt("../../data/count_data_hemat_v1.txt",
                           delimiter=" ",
                           dtype=int).T

# Synthetic gene names gene_1 .. gene_3470, shared by both batches.
gene_names = ["gene_" + str(n + 1) for n in range(3470)]
hemat_gene_list = [gene_names] * 2

# Two batches: rows 0..2728 and rows 2729..4648 of the transposed table.
first_batch = hemat_data[0:2729, 0:3470]
second_batch = hemat_data[2729:4649, 0:3470]
hemat_data = [first_batch, second_batch]

hemat_integrated, hemat_corrected, hemat_genes = scanorama.correct(
    hemat_data, hemat_gene_list, return_dimred=True)

# Write one integrated and one corrected file per batch (1-based suffix).
# The *_corrected entries are scipy.sparse CSR matrices, hence .toarray().
for b in range(2):
    batch_tag = str(b + 1)
    np.savetxt("scanorama_hemat_v1_integrated_batch" + batch_tag + ".txt",
               hemat_integrated[b],
               fmt='%10.9f',
               delimiter="\t")
    np.savetxt("scanorama_hemat_v1_corrected_batch" + batch_tag + ".txt",
               hemat_corrected[b].toarray(),
               fmt='%10.9f',
               delimiter="\t")

# Gene names returned by the correction, written as strings.
np.savetxt("scanorama_hemat_v1_genes.txt",
           hemat_genes,
           fmt="%s",
           delimiter="\t")
delimiter=" ", dtype=int) G = LUAD_dim[1] B = LUAD_dim[2] LUAD_gene_list = ["gene_" + str(i) for i in range(1, (G + 1))] LUAD_gene_list = [LUAD_gene_list for i in range(0, B)] offset = 0 LUAD_data = [] for b in range(B): LUAD_data.append(LUAD_count[offset:(offset + LUAD_dim[(3 + b)]), :]) offset += LUAD_dim[(3 + b)] LUAD_integrated, LUAD_corrected, LUAD_genes = scanorama.correct( LUAD_data, LUAD_gene_list, return_dimred=True) #Please note that *_corrected is of type "scipy.sparse.csr.csr_matrix". for b in range(B): np.savetxt("./data/scanorama_LUAD_v1_integrated_batch" + str(b + 1) + ".txt", LUAD_integrated[b], fmt='%10.9f', delimiter="\t") np.savetxt("./data/scanorama_LUAD_v1_corrected_batch" + str(b + 1) + ".txt", LUAD_corrected[b].toarray(), fmt='%10.9f', delimiter="\t") np.savetxt("scanorama_LUAD_v1_genes.txt", LUAD_genes, fmt="%s", delimiter="\t")