Exemple #1
0
def plot_knn(ds: loompy.LoomConnection, out_file: str) -> None:
    """Draw the edges of the "MKNN" column graph and save the figure as a PNG.

    Only columns whose ``_Valid`` attribute is truthy are kept. Node positions
    come from the ``_X`` and ``_Y`` column attributes; nodes themselves are
    not drawn, only the edges.

    Args:
        ds: open loom connection providing the "MKNN" edges and the
            ``_Valid``, ``_X`` and ``_Y`` column attributes.
        out_file: destination path of the 300 dpi PNG.
    """
    total = ds.shape[1]
    keep = ds.col_attrs["_Valid"].astype('bool')

    # Rebuild the sparse adjacency matrix from the edge list, then restrict
    # both rows and columns to the valid cells.
    rows, cols, weights = ds.get_edges("MKNN", axis=1)
    adjacency = sparse.coo_matrix((weights, (rows, cols)),
                                  shape=(total, total)).tocsr()
    adjacency = adjacency[keep, :][:, keep]

    # One (x, y) row per valid cell, in the same order as the matrix rows.
    coords = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).T
    coords = coords[keep, :]

    fig = plt.figure(figsize=(10, 10))
    graph = nx.from_scipy_sparse_matrix(adjacency)
    ax = fig.add_subplot(111)

    nx.draw_networkx_edges(graph,
                           pos=coords,
                           alpha=0.25,
                           width=0.2,
                           edge_color='gray')
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
Exemple #2
0
def plot_graph(ds: loompy.LoomConnection,
               out_file: str,
               tags: List[str] = None) -> None:
    """Scatter-plot every cell at its ``_X``/``_Y`` position, colored by cluster.

    If the file has an "MKNN" column graph, its edges are drawn underneath the
    points. Every cluster ID from 0 to the largest value in "Clusters" gets a
    legend entry; clusters whose members are all flagged in "Outliers" are
    drawn in grey and marked "(outliers)". Every non-empty, non-outlier
    cluster also gets a text label at the median position of its cells. The
    figure is written to ``out_file`` as a 144 dpi PNG.

    Args:
        ds: open loom connection; reads the column attributes "_X", "_Y" and
            "Clusters" and, when present, "Outliers" and "ClusterName".
        out_file: destination path of the PNG.
        tags: optional per-cluster strings, indexed by cluster ID, appended to
            the legend entries (newlines are replaced by spaces).
    """
    logging.info("Loading graph")
    has_edges = "MKNN" in ds.list_edges(axis=1)
    if has_edges:
        (a, b, _) = ds.get_edges("MKNN", axis=1)
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()
    labels = ds.col_attrs["Clusters"]
    if "Outliers" in ds.col_attrs:
        outliers = ds.col_attrs["Outliers"]
    else:
        outliers = np.zeros(ds.shape[1])
    has_names = "ClusterName" in ds.ca.keys()
    n_labels = max(labels) + 1

    # Compute a good size for the markers, based on local density: the marker
    # area is 24 times the eps_pct-th percentile of the largest distance among
    # each point's min_pts nearest neighbors.
    logging.info("Computing node size")
    min_pts = 50
    eps_pct = 60
    nn = NearestNeighbors(n_neighbors=min_pts, algorithm="ball_tree", n_jobs=4)
    nn.fit(pos)
    knn = nn.kneighbors_graph(mode='distance')
    k_radius = knn.max(axis=1).toarray()
    epsilon = 24 * np.percentile(k_radius, eps_pct)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)

    # Draw edges
    if has_edges:
        logging.info("Drawing edges")
        # Materialize the segments: LineCollection is documented to take a
        # sequence of segments, not a one-shot iterator.
        lc = LineCollection(list(zip(pos[a], pos[b])),
                            linewidths=0.25,
                            zorder=0,
                            color='grey',
                            alpha=0.1)
        ax.add_collection(lc)

    # Draw nodes
    logging.info("Drawing nodes")
    plots = []
    names = []
    for i in range(n_labels):
        cluster = labels == i
        n_cells = cluster.sum()
        if np.all(outliers[cluster] == 1):
            # NOTE: this draws every outlier cell, not only those of cluster i.
            edgecolor = colorConverter.to_rgba('red', alpha=.1)
            plots.append(
                ax.scatter(x=pos[outliers == 1, 0],
                           y=pos[outliers == 1, 1],
                           c='grey',
                           marker='.',
                           edgecolors=edgecolor,
                           alpha=0.1,
                           s=epsilon))
            names.append(f"{i}/n={n_cells}  (outliers)")
        else:
            plots.append(
                ax.scatter(x=pos[cluster, 0],
                           y=pos[cluster, 1],
                           c=cg.colors75[np.mod(i, 75)],
                           marker='.',
                           lw=0,
                           s=epsilon,
                           alpha=0.75))
            txt = str(i)
            if has_names:
                txt = ds.ca.ClusterName[ds.ca.Clusters == i][0]
            if tags is not None:
                names.append(f"{txt}/n={n_cells} " +
                             tags[i].replace("\n", " "))
            else:
                names.append(f"{txt}/n={n_cells}")
    logging.info("Drawing legend")
    ax.legend(plots,
              names,
              scatterpoints=1,
              markerscale=2,
              loc='upper left',
              bbox_to_anchor=(1, 1),
              fancybox=True,
              framealpha=0.5,
              fontsize=10)

    logging.info("Drawing cluster IDs")
    for lbl in range(n_labels):
        members = labels == lbl
        # Skip empty clusters before looking up a name: indexing [0] into an
        # empty selection would raise IndexError.
        if not members.any():
            continue
        if np.all(outliers[members] == 1):
            continue
        txt = str(lbl)
        if has_names:
            txt = ds.ca.ClusterName[ds.ca.Clusters == lbl][0]
        (x, y) = np.median(pos[members], axis=0)
        ax.text(x,
                y,
                txt,
                fontsize=12,
                bbox=dict(facecolor='white', alpha=0.5, ec='none'))
    logging.info("Saving to file")
    fig.savefig(out_file, format="png", dpi=144, bbox_inches='tight')
    plt.close(fig)
Exemple #3
0
def plot_graph_age(ds: loompy.LoomConnection, out_file: str,
                   tags: List[str]) -> None:
    """Plot the "MKNN" graph of the valid cells with nodes colored by age.

    Edges are drawn first, then the nodes colored through the
    ``nipy_spectral_r`` colormap applied to ``(age - 6) / 14``. Each non-empty
    cluster is annotated with "#<id>" (plus its tag, if non-empty) at the
    median position of its cells, and a small color key with one entry per
    distinct age is added. The figure is saved to ``out_file`` as a 300 dpi
    PNG.

    Args:
        ds: open loom connection; reads the "MKNN" edges and the column
            attributes "_Valid", "_X", "_Y", "Clusters" and "Age".
        out_file: destination path of the PNG.
        tags: per-cluster strings, indexed by cluster ID.
    """
    def parse_age(age: str) -> float:
        # An empty string maps to 0. Otherwise the first character is the unit
        # and the remainder a number; 19 is added when the unit is "P".
        if age == "":
            return 0
        value = float(age[1:])
        if age[0] == "P":
            value += 19.
        return value

    total = ds.shape[1]
    keep = ds.col_attrs["_Valid"].astype('bool')

    src, dst, weights = ds.get_edges("MKNN", axis=1)
    adjacency = sparse.coo_matrix((weights, (src, dst)),
                                  shape=(total, total)).tocsr()
    adjacency = adjacency[keep, :][:, keep]

    # Positions in graph order; draw_networkx_edges needs these to match the
    # node numbering of the graph built from `adjacency`.
    layout = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).T[keep, :]

    # The sorting below is to make every circle visible and avoid overlaps in
    # crowded situations. The nodes are drawn from a permuted copy of the
    # positions (with matching colors) instead of re-sorting the graph.
    by_x = np.argsort(layout[:, 0], kind="mergesort")
    by_y = np.argsort(layout[:, 1], kind="mergesort")
    draw_order = by_x[by_y]
    shuffled = layout[draw_order, :]
    labels = ds.col_attrs["Clusters"][keep][draw_order]
    age = np.fromiter((parse_age(raw) for raw in ds.col_attrs["Age"]),
                      dtype=float)[keep][draw_order]

    fig = plt.figure(figsize=(10, 10))
    graph = nx.from_scipy_sparse_matrix(adjacency)
    ax = fig.add_subplot(111)

    # Draw the KNN graph first, with gray transparent edges
    nx.draw_networkx_edges(graph,
                           pos=layout,
                           alpha=0.1,
                           width=0.1,
                           edge_color='gray')
    # Then draw the nodes, colored by age
    node_colors = plt.cm.nipy_spectral_r((age - 6) / 14.)
    nx.draw_networkx_nodes(graph,
                           pos=shuffled,
                           node_color=node_colors,
                           node_size=10,
                           alpha=0.4,
                           linewidths=0)

    # Annotate every non-empty cluster at the median of its cells.
    for lbl in range(0, max(labels) + 1):
        members = labels == lbl
        if not members.any():
            continue
        (x, y) = np.median(shuffled[members], axis=0)
        caption = "#" + str(lbl)
        tag = tags[lbl]
        if len(tag) > 0:
            caption = caption + "\n" + tag
        ax.text(x,
                y,
                caption,
                fontsize=8,
                bbox=dict(facecolor='gray', alpha=0.3, ec='none'))
    ax.axis('off')

    # Color key: one swatch and caption per distinct age, stacked upwards.
    for row, lev in enumerate(np.unique(age)):
        swatch = plt.Rectangle((0.90, 0.7 + row * 0.016),
                               0.014,
                               0.014,
                               color=plt.cm.nipy_spectral_r((lev - 6) / 14.),
                               clip_on=0,
                               transform=ax.transAxes)
        ax.add_patch(swatch)
        if lev < 18.5:
            caption = "E%.1f" % lev
        else:
            caption = "P%.1f" % (lev - 19)
        ax.text(0.93, 0.703 + row * 0.016, caption, transform=ax.transAxes)

    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
Exemple #4
0
def plot_classification(ds: loompy.LoomConnection, out_file: str) -> None:
    """Plot the "MKNN" graph of the valid cells with nodes colored by class.

    Each cell whose "Class0" attribute matches one of the known classes is
    colored by passing its value in the corresponding "Class_<name>" attribute
    through a colormap running from transparent white to that class's color.
    Cells with "Class0" equal to "Erythrocyte" or "Excluded" get fixed colors;
    all other cells keep the grey default. The figure is saved to ``out_file``
    as a 300 dpi PNG.

    Args:
        ds: open loom connection; reads the "MKNN" edges and the column
            attributes "_Valid", "_X", "_Y", "Class0" and "Class_<name>" for
            each class that has at least one cell.
        out_file: destination path of the PNG.
    """
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix(
        (w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    pos = np.vstack(
        (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    classes = [
        "Neurons", "Astrocyte", "Ependymal", "OEC", "Oligos", "Schwann",
        "Cycling", "Vascular", "Immune"
    ]
    # 'tab20' is the current name of the 20-color palette formerly registered
    # as 'Vega20'; take the center of each of its 20 bins, one per class.
    palette = plt.get_cmap('tab20')
    colors = [palette((ix + 0.5) / 20) for ix in range(len(classes))]

    # One RGBA row per cell (valid or not), defaulting to grey.
    combined_colors = np.zeros((n_cells, 4)) + np.array((0.5, 0.5, 0.5, 0))
    class0 = ds.col_attrs["Class0"]

    for cls, color in zip(classes, colors):
        cells = class0 == cls
        if np.sum(cells) > 0:
            cmap = LinearSegmentedColormap.from_list('custom cmap',
                                                     [(1, 1, 1, 0), color])
            combined_colors[cells] = [
                cmap(x) for x in ds.col_attrs["Class_" + cls][cells]
            ]

    # Fixed colors for the two special classes; assigning through an all-False
    # mask is a no-op, so no emptiness check is needed.
    combined_colors[class0 == "Erythrocyte"] = np.array([1, 0.71, 0.76, 0])
    combined_colors[class0 == "Excluded"] = np.array([0.5, 0.5, 0.5, 0])

    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Class")
    nx.draw_networkx_edges(g, pos=pos, alpha=0.2, width=0.1, edge_color='gray')
    nx.draw_networkx_nodes(g,
                           pos=pos,
                           node_color=combined_colors[valid],
                           node_size=10,
                           alpha=0.6,
                           linewidths=0)
    ax.axis('off')

    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close(fig)
Exemple #5
0
    def fit_predict(self, ds: loompy.LoomConnection) -> np.ndarray:
        """Cluster the cells of ``ds`` and store the result in the file.

        The algorithm is selected by ``self.method``:

        * "hdbscan": HDBSCAN on the ``_X``/``_Y`` positions of the valid cells.
        * "dbscan": DBSCAN on the same positions, with epsilon taken as the
          ``self.eps_pct``-th percentile of the per-cell maximum distance
          among its ``self.min_pts`` nearest neighbors. When
          ``self.outliers`` is false, each noise point is given the label of
          its nearest non-noise point.
        * "multilev" / "wmultilev": igraph multilevel community detection on
          the "KNN" column graph, unweighted / weighted.
        * "mknn_louvain": the same on the weighted "MKNN" graph with a fixed
          random seed; when ``self.outliers`` is true, communities smaller
          than ``self.min_pts`` are labelled -1.
        * anything else: ``cg.LouvainJaccard`` on the "KNN" graph restricted
          to the valid cells.

        Side effects: writes ``ds.ca.Clusters`` and ``ds.ca.Outliers``; in the
        "dbscan" branch ``self.min_pts`` is filled in when it is None.

        Args:
            ds: open loom connection to cluster.

        Returns:
            Integer array with one cluster label per column of ``ds``.

        Raises:
            ValueError: if the final cluster IDs are not the contiguous range
                0..max.
        """
        n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
        n_total = ds.shape[1]
        logging.info("%d of %d cells were valid", n_valid, n_total)
        logging.info("%d of %d genes were valid",
                     np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
        cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

        # NOTE(review): the three igraph branches build the graph over all
        # n_total columns, so they presumably yield n_total labels, while the
        # assignment through `cells` at the end expects one label per valid
        # cell. That looks like a shape mismatch whenever some cells are
        # invalid -- confirm with the callers.
        if self.method == "hdbscan":
            logging.info("HDBSCAN clustering in t-SNE space")
            min_pts = 10 if n_valid < 3000 else (
                20 if n_valid < 20000 else 100)
            tsne_pos = np.vstack(
                (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[cells, :]
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_pts)
            labels = clusterer.fit_predict(tsne_pos)
        elif self.method == "dbscan":
            logging.info("DBSCAN clustering in t-SNE space")
            if self.min_pts is None:
                self.min_pts = 10 if n_valid < 3000 else (
                    20 if n_valid < 20000 else 100)
            tsne_pos = np.vstack(
                (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[cells, :]

            # Determine a good epsilon
            nn = NearestNeighbors(n_neighbors=self.min_pts,
                                  algorithm="ball_tree",
                                  n_jobs=4)
            nn.fit(tsne_pos)
            knn = nn.kneighbors_graph(mode='distance')
            k_radius = knn.max(axis=1).toarray()
            epsilon = np.percentile(k_radius, self.eps_pct)

            clusterer = DBSCAN(eps=epsilon, min_samples=self.min_pts)
            labels = clusterer.fit_predict(tsne_pos)
            if not self.outliers:
                noise = labels == -1
                # Only reassign when DBSCAN actually flagged noise:
                # scikit-learn's input validation rejects a zero-row query.
                if noise.any():
                    # Assign each outlier to the same cluster as the nearest non-outlier
                    nn = NearestNeighbors(n_neighbors=50,
                                          algorithm="ball_tree")
                    nn.fit(tsne_pos[labels >= 0])
                    nearest = nn.kneighbors(tsne_pos[noise],
                                            n_neighbors=1,
                                            return_distance=False)
                    labels[noise] = labels[labels >= 0][nearest.flat[:]]
        elif self.method == "multilev":
            logging.info(
                "comunity-multilevel clustering on unweighted KNN graph")
            (a, b, w) = ds.get_edges("KNN", axis=1)
            G = igraph.Graph(n_total, list(zip(a, b)), directed=False)
            VxCl = G.community_multilevel(return_levels=False)
            labels = np.array(VxCl.membership)
        elif self.method == "wmultilev":
            logging.info(
                "comunity-multilevel clustering on the multiscale KNN graph")
            (a, b, w) = ds.get_edges("KNN", axis=1)
            G = igraph.Graph(n_total,
                             list(zip(a, b)),
                             directed=False,
                             edge_attrs={'weight': w})
            VxCl = G.community_multilevel(return_levels=False,
                                          weights="weight")
            labels = np.array(VxCl.membership)
        elif self.method == "mknn_louvain":
            logging.info(
                "comunity-multilevel clustering on the multiscale MKNN graph")
            (a, b, w) = ds.get_edges("MKNN", axis=1)
            # Fixed seed, handed to igraph as its random number generator.
            random.seed(13)
            igraph._igraph.set_random_number_generator(random)
            G = igraph.Graph(n_total,
                             list(zip(a, b)),
                             directed=False,
                             edge_attrs={'weight': w})
            VxCl = G.community_multilevel(return_levels=False,
                                          weights="weight")
            labels = np.array(VxCl.membership)
            logging.info(f"labels.shape = {labels.shape}")
            if not self.outliers:
                # Threshold 0 keeps every community.
                bigs = np.where(np.bincount(labels) >= 0)[0]
            else:
                bigs = np.where(np.bincount(labels) >= self.min_pts)[0]
            # Renumber the kept communities 0, 1, 2, ...; everything else
            # becomes -1. A dict lookup avoids scanning `bigs` for every cell.
            mapping = {k: v for v, k in enumerate(bigs)}
            labels = np.array([mapping.get(x, -1) for x in labels])
        else:
            logging.info("Louvain clustering on the multiscale KNN graph")
            (a, b, w) = ds.get_edges("KNN", axis=1)
            knn = sparse.coo_matrix(
                (w, (a, b)),
                shape=(ds.shape[1], ds.shape[1])).tocsr()[cells, :][:, cells]
            lj = cg.LouvainJaccard(resolution=1, jaccard=False)
            labels = lj.fit_predict(knn.tocoo())

        # At this point, cells should be labeled 0, 1, 2, ...
        # But there may also be cells labelled -1 for outliers, which we want to keep track of
        labels_all = np.zeros(ds.shape[1], dtype='int')
        outliers = np.zeros(ds.shape[1], dtype='int')
        labels_all[cells] = labels
        outliers[labels_all == -1] = 1
        # Shift so the smallest label becomes 0 (outliers, if any, end up as
        # cluster 0).
        labels_all[cells] = labels - np.min(labels)
        ds.ca.Clusters = labels_all
        ds.ca.Outliers = outliers
        logging.info("Found %d clusters", labels_all.max() + 1)
        if not len(set(ds.ca.Clusters)) == ds.ca.Clusters.max() + 1:
            raise ValueError("There are holes in the cluster ID sequence!")
        return labels_all