def visualize_dictionary(ct, X_dimred, genes, cell_types,
                         namespace, cluster_method, verbose=True):
    """Write UMAP and covariance plots for every cluster of a fitted model.

    A KNN graph and UMAP embedding are computed from ``X_dimred``; the
    embedding is clipped to ``[-20, 20]`` on every axis.  The function then
    writes, using ``'{namespace}_pan_..._{cluster_method}...'`` file names:

    * one cell-type plot (through ``scanorama.visualize``),
    * per cluster: an intensity scatter, an intensity histogram and a
      binary membership scatter (intensity > 0.8),
    * per cluster: a clustered heatmap of the cluster's gene-by-gene matrix,
      restricted to genes with at least one non-zero entry.

    Parameters
    ----------
    ct : object
        Must expose ``labels_`` (indexed as ``labels_[:, c_idx]``) and
        ``dictionary_`` (indexed as ``dictionary_[:, :, c_idx]``).  Both are
        read through copies and are left unmodified.
    X_dimred : array-like
        Matrix handed to ``AnnData(X=...)`` for the KNN/UMAP computation.
    genes : array-like
        Gene names; indexed with a boolean mask over the rows of
        ``dictionary_[:, :, c_idx]``.
    cell_types : array-like
        Cell type labels; encoded with ``LabelEncoder`` for the type plot
        and passed to ``print_cell_types``.
    namespace, cluster_method : str
        Inserted into every output file name.
    verbose : bool, optional
        When true, progress messages are emitted through ``tprint``.

    Returns
    -------
    None
    """
    from anndata import AnnData
    from scanorama import visualize
    import scanpy as sc
    import seaborn as sns

    # KNN and UMAP.
    if verbose:
        tprint('Constructing KNN graph...')
    adata = AnnData(X=X_dimred)
    sc.pp.neighbors(adata, use_rep='X')

    if verbose:
        tprint('Visualizing with UMAP...')
    sc.tl.umap(adata, min_dist=0.5)
    # Clamp outlying coordinates so a few far-away points do not dominate
    # the axes of every scatter plot.
    embedding = np.clip(np.array(adata.obsm['X_umap']), -20, 20)

    # Visualize cell types.
    le = LabelEncoder().fit(cell_types)
    cell_types_int = le.transform(cell_types)
    visualize(
        None, cell_types_int,
        '{}_pan_umap_{}_type'.format(namespace, cluster_method),
        np.array(sorted(set(cell_types))),
        embedding=embedding,
        image_suffix='.png'
    )

    n_clusters = ct.labels_.shape[1]

    for c_idx in range(n_clusters):
        # Work on a float copy: normalising the slice in place would write
        # through the view and rescale ``ct.labels_`` for the caller.
        intensity = np.array(ct.labels_[:, c_idx], dtype=float)
        peak = intensity.max()
        if peak != 0:
            # An all-zero column stays all-zero instead of becoming NaN.
            intensity /= peak

        print('\nCluster {}'.format(c_idx))
        print_cell_types(cell_types, intensity)

        title = 'Cluster {}'.format(c_idx)

        # Visualize cluster in UMAP coordinates.
        _save_umap_scatter(
            embedding, intensity, title,
            '{}_pan_umap_{}_cluster{}.png'
            .format(namespace, cluster_method, c_idx)
        )

        fig = plt.figure()
        plt.title(title)
        plt.hist(intensity.flatten(), bins=100)
        plt.savefig('{}_pan_umap_{}_intensehist{}.png'
                    .format(namespace, cluster_method, c_idx), dpi=500)
        plt.close(fig)

        # Hard membership: cells whose normalised intensity exceeds 0.8.
        membership = (intensity > 0.8) * 1
        _save_umap_scatter(
            embedding, membership, title,
            '{}_pan_umap_{}_member{}.png'
            .format(namespace, cluster_method, c_idx)
        )

    # Boolean-mask indexing below needs an ndarray, not a plain list.
    gene_names = np.asarray(genes)

    for c_idx in range(n_clusters):
        # Visualize covariance matrix.
        # Copy first so that zeroing the NaNs does not alter
        # ``ct.dictionary_``.
        corr = np.array(ct.dictionary_[:, :, c_idx])
        corr[np.isnan(corr)] = 0

        gene_idx = np.sum(np.abs(corr), axis=1) > 0
        # Hierarchical clustering needs at least two observations, so
        # clusters with fewer than two active genes are skipped.
        if np.sum(gene_idx) < 2:
            continue
        corr = corr[gene_idx][:, gene_idx]

        plt.rcParams.update({'font.size': 5})
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Symmetric colour limits keep zero at the palette's midpoint.
        corr_max = max(corr.max(), abs(corr.min()))
        sns.clustermap(corr, xticklabels=gene_names[gene_idx],
                       yticklabels=gene_names[gene_idx], cmap=cmap,
                       vmin=-corr_max, vmax=corr_max)
        # clustermap opens its own figure, which is now the current one;
        # title it directly rather than titling a separate blank figure.
        fig = plt.gcf()
        fig.suptitle('Cluster {}'.format(c_idx))
        plt.xticks(rotation=90)
        plt.yticks(rotation=90)
        plt.savefig('{}_pan_cov_{}_cluster{}.png'
                    .format(namespace, cluster_method, c_idx), dpi=500)
        plt.close(fig)


def _save_umap_scatter(embedding, values, title, path):
    """Save a 'Blues' scatter of the first two embedding axes, then close it."""
    fig = plt.figure()
    plt.title(title)
    plt.scatter(embedding[:, 0], embedding[:, 1],
                c=values, cmap='Blues', s=1)
    plt.savefig(path, dpi=500)
    # Release the figure; otherwise one figure per plot stays open.
    plt.close(fig)
save_datasets(datasets, genes, data_names) labels = [] names = [] curr_label = 0 for i, a in enumerate(datasets): labels += list(np.zeros(a.shape[0]) + curr_label) names.append(data_names[i]) curr_label += 1 labels = np.array(labels, dtype=int) embedding = visualize(datasets_dimred, labels, NAMESPACE + '_ds', names, perplexity=600, n_iter=400, size=100) cell_labels = (open( 'data/cell_labels/293t_jurkat_cluster.txt').read().rstrip().split()) le = LabelEncoder().fit(cell_labels) labels = le.transform(cell_labels) cell_types = le.classes_ visualize(None, labels, NAMESPACE + '_type', cell_types, perplexity=600,
labels = np.array(labels, dtype=int) pbmc_genes = [ 'CD14', 'PTPRC', 'FCGR3A', 'ITGAX', 'ITGAM', 'CD19', 'HLA-DRB1', 'FCGR2B', 'FCGR2A', 'CD3E', 'CD4', 'CD8A', 'CD8B', 'CD28', 'CD8', 'TBX21', 'IKAROS', 'IL2RA', 'CD44', 'SELL', 'CCR7', 'MS4A1', 'CD68', 'CD163', 'IL5RA', 'SIGLEC8', 'KLRD1', 'NCR1', 'CD22', 'IL3RA', 'CCR6', 'IL7R', 'CD27', 'FOXP3', 'PTCRA', 'ID3', 'PF4', 'CCR10', 'SIGLEC7', 'NKG7', 'S100A8', 'CXCR3', 'CCR5', 'CCR3', 'CCR4', 'PTGDR2', 'RORC' ] embedding = visualize(datasets_dimred, labels, NAMESPACE + '_ds', names, gene_names=pbmc_genes, gene_expr=np.concatenate(datasets), genes=genes, perplexity=500, n_iter=400) cell_labels = ( open('data/cell_labels/pbmc_cluster.txt').read().rstrip().split()) le = LabelEncoder().fit(cell_labels) cell_labels = le.transform(cell_labels) cell_types = le.classes_ visualize(datasets_dimred, cell_labels, NAMESPACE + '_type', cell_types,
# Build one integer dataset label per cell and collect the dataset names.
# NOTE(review): `labels` is extended in place, so it is assumed to have been
# initialised (presumably as an empty list) before this point -- confirm.
names = []
for i, a in enumerate(datasets):
    labels += [i] * int(a.shape[0])
    names.append(data_names[i])
labels = np.array(labels, dtype=int)

hsc_genes = [
    'GATA2', 'APOE', 'SPHK1', 'CTSE', 'FOS'
]

# Visualize with PCA.
visualize(None, labels, NAMESPACE + '_ds', names,
          gene_names=hsc_genes, genes=genes,
          gene_expr=np.concatenate(datasets),
          embedding=np.concatenate(datasets_dimred),
          size=4)

# Read the whitespace-separated cluster labels; the context manager closes
# the file handle once the contents have been read.
with open('data/cell_labels/hsc_cluster.txt') as label_file:
    cell_labels = label_file.read().rstrip().split()

le = LabelEncoder().fit(cell_labels)
cell_labels = le.transform(cell_labels)
cell_types = le.classes_

visualize(None, cell_labels, NAMESPACE + '_type', cell_types,
          embedding=np.concatenate(datasets_dimred), size=4)
from scanorama import correct, visualize, process_data
from scanorama import dimensionality_reduce, merge_datasets

# Prefix handed to `visualize` for this experiment.
NAMESPACE = 'different'

# Dataset paths to load; they also serve as the legend names in the plot.
data_names = [
    'data/293t_jurkat/293t',
    'data/brain/neuron_9k',
    'data/hsc/hsc_mars',
    'data/macrophage/uninfected',
    'data/pancreas/pancreas_inDrop',
    'data/pbmc/10x/68k_pbmc',
]

if __name__ == '__main__':
    datasets, genes_list, n_cells = load_names(data_names)
    datasets, genes = correct(datasets, genes_list)
    datasets = [normalize(ds, axis=1) for ds in datasets]
    datasets_dimred = dimensionality_reduce(datasets)

    # One integer label per cell: the index of the dataset it came from.
    cells_per_dataset = np.array(
        [ds.shape[0] for ds in datasets], dtype=int
    )
    labels = np.repeat(np.arange(len(datasets)), cells_per_dataset)

    visualize(datasets_dimred, labels, NAMESPACE, data_names)