コード例 #1
0
	def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold

		Settings are read from the instance, not passed as arguments:
			self.genes				Genes to use for manifold learning (used to index the rows of ds);
									if None, genes are picked by feature selection
			self.n_genes			Number of genes to select (only used when self.genes is None)
			self.filter_cellcycle	Path to a whitespace-separated list of cell cycle gene names, or None
			self.layer				Layer passed to the PCA projections of the clustering passes
			self.alpha				The scale parameter for multiscale KNN
			self.gtsne				Use graph t-SNE for layout (default: standard tSNE)

		Side effects:
			Overwrites the row attribute "_Selected" and the column attribute "Clusters" on ds.

		Args:
			ds		The dataset; rows and columns must both carry a "_Valid" attribute

		Returns:
			knn		The multiscale knn graph as a sparse matrix over all cells, with k = min(100, n_valid - 1)
					and edge weight 1 / rank^alpha
			mknn	Mutual subgraph of the knn edges having weight > 0.05
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2); cells that are
					not valid are placed at the per-axis minimum of the layout
		"""
		n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
		n_total = ds.shape[1]
		logging.info("%d of %d cells were valid", n_valid, n_total)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
		cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			# Read the gene list inside a context manager so the file handle is always closed
			with open(self.filter_cellcycle) as f:
				cell_cycle_genes = np.array(f.read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("None cell cycle genes where filtered, check your gene list")
		else:
			mask = None

		if self.genes is None:
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

			# First pass: cluster on the feature-selected genes, then replace them by cluster markers
			(_, n_labels) = self._cluster_on_genes(ds, normalizer, genes, cells, n_valid)
			logging.info("Marker selection")
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
		else:
			genes = self.genes

		# Second pass (the only one when self.genes is given): cluster again and pick markers
		(labels, n_labels) = self._cluster_on_genes(ds, normalizer, genes, cells, n_valid)
		logging.info("Marker selection")
		# NOTE(review): unlike the first pass, no cell cycle mask is passed here - confirm this is intended
		(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

		# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
		cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
		n_components = min(50, cells_adjusted.shape[0])
		logging.info("PCA projection to %d components", n_components)
		# NOTE(review): this projection does not receive layer=self.layer, unlike the clustering passes - confirm
		pca = cg.PCAProjection(genes, max_n_components=n_components)
		pca.fit(ds, normalizer, cells=cells_adjusted)
		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer, cells=cells)

		k = min(100, n_valid - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
		n_cells = knn.shape[0]
		# Build the edge list rank by rank: first every cell's nearest neighbour, then every
		# cell's second nearest, and so on, so that a, b and w line up element by element
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		# An edge to the r-th nearest neighbour gets weight 1 / r^alpha
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		# Keep only the stronger edges, then only those present in both directions
		threshold = w > 0.05
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_valid - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
		# Every cell starts at the per-axis minimum of the layout; valid cells then get their position
		tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
		tsne_all[cells] = tsne_pos

		# Transform back to the full set of cells
		knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
		mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

		return (knn, mknn, tsne_all)

	def _cluster_on_genes(self, ds: loompy.LoomConnection, normalizer, genes, cells: np.ndarray, n_valid: int) -> Tuple[np.ndarray, int]:
		"""
		Run one clustering pass: mark the genes as selected, project the valid cells by PCA,
		build a KNN graph and cluster it by Louvain-Jaccard

		Side effects:
			Overwrites the row attribute "_Selected" and the column attribute "Clusters" on ds.

		Args:
			ds			The dataset
			normalizer	The fitted normalizer handed to the PCA projection
			genes		The genes to use (used to index the rows of ds)
			cells		Indices of the valid cells (columns of ds)
			n_valid		Number of valid cells

		Returns:
			labels		Cluster label of each valid cell, in the order of cells
			n_labels	Number of clusters (largest label + 1)
		"""
		selected = np.zeros(ds.shape[0])
		selected[genes] = 1
		ds.set_attr("_Selected", selected, axis=0)
		logging.info("%d genes selected", selected.sum())

		n_components = min(50, n_valid)
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
		transformed = pca.fit_transform(ds, normalizer, cells=cells)

		logging.info("Generating KNN graph")
		k = min(10, n_valid - 1)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors_graph(mode='connectivity').tocoo()

		logging.info("Louvain-Jaccard clustering")
		lj = cg.LouvainJaccard(resolution=1)
		labels = lj.fit_predict(knn)

		# Make labels for excluded cells == -1
		labels_all = np.full(ds.shape[1], -1, dtype='int')
		labels_all[cells] = labels
		ds.set_attr("Clusters", labels_all, axis=1)
		n_labels = np.max(labels) + 1
		logging.info("Found %s LJ clusters", n_labels)

		return (labels, n_labels)
コード例 #2
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        All cells of ds are used. Settings are read from the instance, not
        passed as arguments:
            self.genes              Genes to use for manifold learning (used to index the rows of ds);
                                    if None, genes are picked by feature selection followed by
                                    clustering and marker selection
            self.n_genes            Number of genes to select (only used when self.genes is None)
            self.filter_cellcycle   Path to a whitespace-separated list of cell cycle gene names, or None
            self.k                  Upper bound on the number of neighbours (capped at n_cells - 1)
            self.alpha              The scale parameter for multiscale KNN
            self.gtsne              Use graph t-SNE for layout (default: standard tSNE)

        Side effects:
            Overwrites the row attribute "_Selected" on ds and, when self.genes
            is None, the column attribute "Clusters".

        Args:
            ds      The dataset; rows must carry a "_Valid" attribute

        Returns:
            knn     The multiscale knn graph as a sparse matrix, with edge weight 1 / rank^alpha
            mknn    Mutual subgraph of the knn edges having weight > 0.05
            pos     2D projection (t-SNE or gt-SNE) as returned by the layout
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("%d of %d genes were valid",
                     np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)

        if self.filter_cellcycle is not None:
            # Read the gene list inside a context manager so the file handle is always closed
            with open(self.filter_cellcycle) as f:
                cell_cycle_genes = np.array(f.read().split())
            mask = np.in1d(ds.row_attrs["Gene"], cell_cycle_genes)
            if np.sum(mask) == 0:
                logging.warning(
                    "None cell cycle genes where filtered, check your gene list"
                )
        else:
            mask = None

        if self.genes is None:
            logging.info("Selecting up to %d genes", self.n_genes)
            genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                          mu=normalizer.mu,
                                                          sd=normalizer.sd,
                                                          mask=mask)

            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            transformed = pca.fit_transform(ds, normalizer)

            logging.info("Generating balanced KNN graph")
            k = min(self.k, n_cells - 1)
            bnn = cg.BalancedKNN(k=k, maxl=2 * k)
            bnn.fit(transformed)
            knn = bnn.kneighbors_graph(mode='connectivity')
            knn = knn.tocoo()
            mknn = knn.minimum(knn.transpose()).tocoo()

            logging.info("MKNN-Louvain clustering with outliers")
            (a, b, w) = (mknn.row, mknn.col, mknn.data)
            G = igraph.Graph(list(zip(a, b)),
                             directed=False,
                             edge_attrs={'weight': w})
            VxCl = G.community_multilevel(return_levels=False,
                                          weights="weight")
            labels = np.array(VxCl.membership)
            # Clusters with at least 10 members are renumbered 0..n-1 in order of
            # their original label; members of smaller clusters become outliers (-1)
            bigs = np.where(np.bincount(labels) >= 10)[0]
            mapping = {old: new for new, old in enumerate(bigs)}
            # A dict lookup replaces the linear scan of bigs for every cell
            labels = np.array([mapping.get(x, -1) for x in labels])

            ds.set_attr("Clusters", labels, axis=1)
            n_labels = np.max(labels) + 1
            logging.info("Found " + str(n_labels) + " clusters")

            logging.info("Marker selection")
            # NOTE(review): if no cluster reaches 10 members every label is -1, n_labels
            # is 0 and the division below cannot yield a usable n_markers - confirm
            # whether that case can occur
            (genes, _,
             _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)
        else:
            genes = self.genes

        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        if self.genes is None:
            # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
            # (labels are shifted so that the smallest one, -1 when there are outliers, becomes 0)
            cells_adjusted = cg.cap_select(labels - labels.min(),
                                           np.arange(n_cells),
                                           int(n_cells * 0.2))
            n_components = min(50, cells_adjusted.shape[0])
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer, cells=cells_adjusted)
        else:
            # No clustering was done for a user-supplied gene list, so fit on all cells
            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer)

        # Note that here we're transforming all cells; we just did the fit on the selection
        transformed = pca.transform(ds, normalizer)

        k = min(self.k, n_cells - 1)
        logging.info("Generating multiscale KNN graph (k = %d)", k)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k)
        bnn.fit(transformed)
        # Second element of the result, without its first column - presumably the
        # neighbour indices minus the cell itself; TODO confirm against BalancedKNN
        knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
        n_cells = knn.shape[0]
        # Build the edge list rank by rank: first every cell's nearest neighbour, then
        # every cell's second nearest, and so on, so that a, b and w line up
        a = np.tile(np.arange(n_cells), k)
        b = np.reshape(knn.T, (n_cells * k, ))
        # An edge to the r-th nearest neighbour gets weight 1 / r^alpha
        w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
        knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
        # Keep only the stronger edges, then only those present in both directions
        threshold = w > 0.05
        mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])),
                                 shape=(n_cells, n_cells))
        mknn = mknn.minimum(mknn.transpose()).tocoo()

        perplexity = min(k, (n_cells - 1) / 3 - 1)
        if self.gtsne:
            logging.info("gt-SNE layout")
            # Note that perplexity argument is ignored in this case, but must still be given
            # because bhtsne will check that it has a valid value
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed,
                                                             knn=knn.tocsr())
        else:
            logging.info("t-SNE layout")
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)

        return (knn, mknn, tsne_pos)