def explore_smushers():
    """Interactively explore matrix decompositions (PCA/ICA/NMF) of the
    Macosko 2015 "big clusters" retina data.

    Fits each algorithm on both the raw counts and the robust-PCA
    "lowrank" version, then exposes an ipywidgets ``interact`` UI that
    shows either a per-cluster summary heatmap or a component pairplot.
    """
    import macosko2015
    six_clusters, six_clusters_cells, six_clusters_genes = \
        macosko2015.load_big_clusters()

    hue = 'cluster_n_celltype'
    groupby = six_clusters_cells[hue]
    # NOTE(review): `fig_code` is not defined in this function -- it is
    # assumed to be imported elsewhere in this module; confirm.
    palette = fig_code.cluster_names_to_color

    # Robust-PCA smoothed ("lowrank") expression, subset to the same
    # cells and genes as the big-clusters matrix.
    csv = macosko2015.BASE_URL + 'differential_clusters_lowrank.csv'
    lowrank = pd.read_csv(csv, index_col=0)
    lowrank_big_clusters = lowrank.loc[six_clusters.index,
                                       six_clusters.columns]

    algorithms = {'PCA': PCA, 'ICA': FastICA, 'NMF': NMF}

    def _smush(n_components):
        """Fit every algorithm on every dataset with ``n_components``.

        Returns (smusheds, smushers, summaries): the reduced data, the
        fitted estimators, and per-cluster mean/median summaries, all
        keyed by '<data name> <algorithm name>'.
        """
        smushers = {}
        smusheds = {}
        summaries = {}
        datas = {'big clusters (lowrank)': lowrank_big_clusters,
                 'big clusters': six_clusters}
        for data_name, data in datas.items():
            for algo_name, algorithm in algorithms.items():
                smusher = algorithm(n_components=n_components,
                                    random_state=2017)
                # NMF requires nonnegative input, so shift the matrix up
                # by its global minimum if any value is negative.
                if data.min().min() < 0:
                    nonnegative = data - data.min().min()
                else:
                    nonnegative = data
                # Bug fix: the original branched on 'digits'/'amacrine'
                # substrings left over from another notebook. Neither
                # substring can occur in the `datas` keys above, and the
                # dead 'digits' branch referenced an undefined global;
                # every live path assigned `data.index`.
                index = data.index
                smushed = pd.DataFrame(
                    smusher.fit_transform(nonnegative), index=index)
                smushed.columns.name = f'{algo_name} components'
                median = smushed.groupby(groupby).median()
                mean = smushed.groupby(groupby).mean()
                prefix = f'{data_name} {algo_name}'
                smusheds[prefix] = smushed
                smushers[prefix] = smusher
                summaries[f'{prefix}: mean'] = mean
                summaries[f'{prefix}: median'] = median
        return smusheds, smushers, summaries

    # Precompute the default (n_components=10) decompositions once so the
    # common interact path does not refit on every widget change.
    smusheds_n10, smushers_n10, summaries_n10 = _smush(n_components=10)

    sns.set(style='whitegrid', context='notebook')

    def plot_smushed(plot_type, algorithm, statistic, lowrank, n_components):
        """Plot one decomposition as a heatmap or a pairplot."""
        if n_components != 10:
            # Non-default component count: refit on demand.
            smusheds, smushers, summaries = _smush(n_components)
        else:
            smusheds, smushers, summaries = (smusheds_n10, smushers_n10,
                                             summaries_n10)
        key = 'big clusters'
        if lowrank:
            key += ' (lowrank)'
        key += f' {algorithm}'
        if plot_type == 'heatmap':
            key_summary = f'{key}: {statistic}'
            summary = summaries[key_summary]
            fig, ax = plt.subplots()
            sns.heatmap(summary)
            ax.set(title=key_summary)
        if plot_type == 'pairplot':
            smushed = smusheds[key]
            smushed_clusters = smushed.join(groupby)
            sns.pairplot(smushed_clusters, hue=hue, palette=palette)
            fig = plt.gcf()
            fig.suptitle(key)

    interact(plot_smushed,
             plot_type=['heatmap', 'pairplot'],
             algorithm=algorithms.keys(),
             statistic=['mean', 'median'],
             lowrank=False,
             n_components=IntSlider(value=10, min=2, max=10))
def explore_phenograph():
    """Interactively shows KNN graphs and community detection."""
    big_clusters, big_clusters_cells, big_clusters_genes = \
        macosko2015.load_big_clusters()
    amacrine, amacrine_cells, amacrine_genes = macosko2015.load_amacrine()

    # --- Read the "lowrank" or "smoothed" data from robust PCA --- #
    csv = macosko2015.BASE_URL + 'differential_clusters_lowrank.csv'
    lowrank = pd.read_csv(csv, index_col=0)
    lowrank_big_clusters = lowrank.loc[
        big_clusters.index, big_clusters.columns]
    lowrank_amacrine = lowrank.loc[amacrine.index, amacrine.columns]

    datasets = {'big clusters lowrank': lowrank_big_clusters,
                'big clusters': big_clusters,
                'amacrine lowrank': lowrank_amacrine,
                'amacrine': amacrine}
    # Precompute a rank-correlation-derived distance matrix per dataset
    # once, outside the interact callback.
    correls = {k: correlation_to_distance(data.T.rank().corr())
               for k, data in datasets.items()}

    def plot_phenograph(dataset='big clusters', primary_metric='euclidean',
                        lowrank=False, k=30, min_cluster_size=10):
        """Cluster one distance matrix with phenograph and draw the
        resulting KNN graph in two bokeh tabs (detected communities vs.
        the paper's cluster assignments)."""
        key = dataset + ' lowrank' if lowrank else dataset
        corr = correls[key]
        if dataset == 'big clusters':
            metadata = big_clusters_cells
            palette = 'Set2'
            cluster_col = 'cluster_id'
        elif dataset == 'amacrine':
            metadata = amacrine_cells
            palette = 'husl'
            cluster_col = 'cluster_id'
        community_col = 'community'

        communities, graph, Q = phenograph.cluster(
            corr, k=k, primary_metric=primary_metric,
            min_cluster_size=min_cluster_size)
        network = networkx.from_scipy_sparse_matrix(graph)
        positions = networkx.spring_layout(network)

        nodes_source = ColumnDataSource(get_nodes_specs(
            positions, metadata, corr.index, communities,
            other_cluster_col=cluster_col, community_col=community_col,
            palette=palette))
        edges_source = ColumnDataSource(get_edges_specs(network, positions))

        # --- First tab: KNN clustering --- #
        tab1 = plot_graph(nodes_source, edges_source,
                          legend_col=community_col,
                          color_col=f'{community_col}_color', tab=True,
                          title='KNN Clustering')
        # --- Second tab: Clusters from paper --- #
        tab2 = plot_graph(nodes_source, edges_source,
                          legend_col='cluster_n_celltype', tab=True,
                          color_col='other_cluster_color',
                          title="Clusters from paper")
        tabs = Tabs(tabs=[tab1, tab2])
        show(tabs)

    # Bug fix: ipywidgets.IntSlider takes min=/max=, not start=/stop=
    # (compare the correct usage in explore_smushers). The original
    # keywords were ignored/rejected, leaving the default 0-100 range.
    interact(plot_phenograph,
             dataset=['big clusters', 'amacrine'],
             primary_metric=['euclidean', 'manhattan'],
             k=IntSlider(min=3, max=100, value=30, step=5),
             min_cluster_size=IntSlider(min=3, max=100, value=10, step=5))
import warnings
import fastcluster
from ipywidgets import interact, IntSlider
import macosko2015
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polo
from scipy.spatial import distance
import seaborn as sns

# Output locations for saved figures and intermediate data files.
FIGURE_FOLDER = 'figures'
DATA_FOLDER = 'data'

# Load the "big clusters" subset of the Macosko 2015 retina data:
# expression matrix plus per-cell and per-gene metadata tables.
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
cluster_ids_unique = cell_metadata['cluster_id'].unique()

# Human-readable cell-type names, keyed by cluster number.
cluster_n_to_name = {
    24: 'Rods',
    25: 'Cones',
    26: 'Bipolar cells\n(group1)',
    27: 'Bipolar cells\n(group2)',
    33: 'Bipolar cells\n(group3)',
    34: 'Muller glia',
}

# The same names keyed by zero-padded string id, e.g. 24 -> 'cluster_24'.
cluster_id_to_name = {
    'cluster_{}'.format(str(n).zfill(2)): name
    for n, name in cluster_n_to_name.items()
}

# One categorical color per unique cluster id.
colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))