def explore_smushers():
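    """Interactively compare PCA, ICA, and NMF decompositions of the data.

    NOTE (assumed context, not shown in this example): relies on module-level
    imports of pandas as pd, seaborn as sns, matplotlib.pyplot as plt,
    ipywidgets' interact/IntSlider, sklearn.decomposition's PCA/FastICA/NMF,
    and a project-local ``fig_code`` module providing the cluster palette.
    """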
    import macosko2015
    six_clusters, six_clusters_cells, six_clusters_genes = \
        macosko2015.load_big_clusters()

    hue = 'cluster_n_celltype'
    groupby = six_clusters_cells[hue]
    palette = fig_code.cluster_names_to_color

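    # --- Read the "lowrank" or "smoothed" data from robust PCA --- #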
    csv = macosko2015.BASE_URL + 'differential_clusters_lowrank.csv'
    lowrank = pd.read_csv(csv, index_col=0)

    lowrank_big_clusters = lowrank.loc[six_clusters.index,
                                       six_clusters.columns]

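    # The matrix decomposition ("smushing") algorithms to compare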
    algorithms = {'PCA': PCA, 'ICA': FastICA, 'NMF': NMF}

    def _smush(n_components):

        smushers = {}
        smusheds = {}

        summaries = {}
        datas = {
            'big clusters (lowrank)': lowrank_big_clusters,
            'big clusters': six_clusters
        }
        for data_name, data in datas.items():
            # print('data_name', data_name)
            for algo_name, algorithm in algorithms.items():
                # print('\talgo_name', algo_name)
                smusher = algorithm(n_components=n_components,
                                    random_state=2017)

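                # NMF requires nonnegative input, so shift the matrix so its
                # minimum is zero when it contains negative values
                # (e.g. the robust-PCA lowrank data)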
                if data.min().min() < 0:
                    nonnegative = data - data.min().min()
                else:
                    nonnegative = data

                # Only the retina "big clusters" data is smushed in this
                # example, so the reduced matrix keeps the cells (rows of
                # the expression matrix) as its index
                index = data.index

                smushed = pd.DataFrame(smusher.fit_transform(nonnegative),
                                       index=index)
                smushed.columns.name = f'{algo_name} components'

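                # Summarize each component per cluster for the heatmap view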
                median = smushed.groupby(groupby).median()
                mean = smushed.groupby(groupby).mean()
                prefix = f'{data_name} {algo_name}'
                smusheds[prefix] = smushed
                smushers[prefix] = smusher

                summaries[f'{prefix}: mean'] = mean
                summaries[f'{prefix}: median'] = median
        return smusheds, smushers, summaries

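    # Precompute the default 10-component fits so the widget can reuse them
    # instead of refitting every time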
    smusheds_n10, smushers_n10, summaries_n10 = _smush(n_components=10)

    sns.set(style='whitegrid', context='notebook')

    def plot_smushed(plot_type, algorithm, statistic, lowrank, n_components):

        if n_components != 10:
            smusheds_nX, smushers_nX, summaries_nX = _smush(n_components)
            smusheds, smushers, summaries = \
                smusheds_nX, smushers_nX, summaries_nX
        else:
            smusheds, smushers, summaries = \
                smusheds_n10, smushers_n10, summaries_n10

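        # Dictionary keys were built as '<data name> <algorithm>',
        # e.g. 'big clusters (lowrank) PCA'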
        key = 'big clusters'
        if lowrank:
            key += ' (lowrank)'
        key += f' {algorithm}'

        if plot_type == 'heatmap':
            # statistics = 'mean',  # 'median'
            key_summary = f'{key}: {statistic}'
            summary = summaries[key_summary]
            fig, ax = plt.subplots()
            sns.heatmap(summary)
            ax.set(title=key_summary)

        if plot_type == 'pairplot':
            smushed = smusheds[key]
            smushed_clusters = smushed.join(groupby)
            sns.pairplot(smushed_clusters, hue=hue, palette=palette)
            fig = plt.gcf()
            fig.suptitle(key)

    interact(plot_smushed,
             plot_type=['heatmap', 'pairplot'],
             algorithm=algorithms.keys(),
             statistic=['mean', 'median'],
             lowrank=False,
             n_components=IntSlider(value=10, min=2, max=10))
def explore_phenograph():
    """Interactively shows KNN graphs and community detection"""
    big_clusters, big_clusters_cells, big_clusters_genes = \
        macosko2015.load_big_clusters()
    amacrine, amacrine_cells, amacrine_genes = macosko2015.load_amacrine()

    # --- Read the "lowrank" or "smoothed" data from robust PCA --- #
    csv = macosko2015.BASE_URL + 'differential_clusters_lowrank.csv'
    lowrank = pd.read_csv(csv, index_col=0)

    lowrank_big_clusters = lowrank.loc[
        big_clusters.index, big_clusters.columns]

    lowrank_amacrine = lowrank.loc[amacrine.index, amacrine.columns]

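    # Compare the original expression matrices with their robust-PCA
    # lowrank ("smoothed") versions for both cell subsets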
    datasets = {'big clusters lowrank': lowrank_big_clusters,
                'big clusters': big_clusters,
                'amacrine lowrank': lowrank_amacrine,
                'amacrine': amacrine}

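    # Cell-cell Spearman correlation (rank genes within each cell, then
    # Pearson correlation of the ranks), converted to a distance matrix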
    correls = {k: correlation_to_distance(data.T.rank().corr())
               for k, data in datasets.items()}

    def plot_phenograph(dataset='big clusters',
                        primary_metric='euclidean',
                        lowrank=False,
                        k=30, min_cluster_size=10):
        key = dataset + ' lowrank' if lowrank else dataset
        corr = correls[key]

        if dataset == 'big clusters':
            metadata = big_clusters_cells
            palette = 'Set2'
            cluster_col = 'cluster_id'
        elif dataset == 'amacrine':
            metadata = amacrine_cells
            palette = 'husl'
            cluster_col = 'cluster_id'
        community_col = 'community'

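        # PhenoGraph builds a k-nearest-neighbor graph and partitions it
        # into communities, returning the community labels, the sparse KNN
        # graph, and the modularity Q; the graph is then laid out with a
        # force-directed spring layout for plotting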
        communities, graph, Q = phenograph.cluster(
            corr, k=k, primary_metric=primary_metric,
            min_cluster_size=min_cluster_size)
        network = networkx.from_scipy_sparse_matrix(graph)
        positions = networkx.spring_layout(network)

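        # Bundle node and edge attributes into bokeh ColumnDataSources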
        nodes_source = ColumnDataSource(get_nodes_specs(
            positions, metadata, corr.index, communities,
            other_cluster_col=cluster_col,
            community_col=community_col, palette=palette))
        edges_source = ColumnDataSource(get_edges_specs(network, positions))

        # --- First tab: KNN clustering --- #
        tab1 = plot_graph(nodes_source, edges_source, legend_col=community_col,
                          color_col=f'{community_col}_color', tab=True,
                          title='KNN Clustering')

        # --- Second tab: Clusters from paper --- #
        tab2 = plot_graph(nodes_source, edges_source,
                          legend_col='cluster_n_celltype', tab=True,
                          color_col='other_cluster_color',
                          title="Clusters from paper")

        tabs = Tabs(tabs=[tab1, tab2])
        show(tabs)

    interact(plot_phenograph, dataset=['big clusters', 'amacrine'],
             primary_metric=['euclidean', 'manhattan'],
             k=IntSlider(min=3, max=100, value=30, step=5),
             min_cluster_size=IntSlider(min=3, max=100, value=10, step=5))
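
# A minimal usage sketch (an assumption, not part of the original example):
# in a Jupyter notebook with ipywidgets enabled, and with bokeh's
# output_notebook() called so the graph tabs render inline, the explorers
# only need to be called:
#
#     explore_smushers()
#     explore_phenograph()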
Example #3
import warnings

import fastcluster
from ipywidgets import interact, IntSlider
import macosko2015
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polo
from scipy.spatial import distance
import seaborn as sns

FIGURE_FOLDER = 'figures'
DATA_FOLDER = 'data'

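# Six "big" retina cell clusters from Macosko et al. (2015): expression
# matrix (cells x genes) plus per-cell and per-gene metadata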
expression, cell_metadata, gene_metadata = macosko2015.load_big_clusters()
cluster_ids_unique = cell_metadata['cluster_id'].unique()

cluster_n_to_name = {
    24: 'Rods',
    25: 'Cones',
    26: 'Bipolar cells\n(group1)',
    27: 'Bipolar cells\n(group2)',
    33: 'Bipolar cells\n(group3)',
    34: 'Muller glia'
}

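# The cell metadata stores clusters as zero-padded string ids
# (e.g. 'cluster_24'), so build a matching id -> name lookup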
cluster_id_to_name = {'cluster_{}'.format(str(i).zfill(2)): name
                      for i, name in cluster_n_to_name.items()}

colors = sns.color_palette(palette='Set2', n_colors=len(cluster_ids_unique))