Esempio n. 1
0
	def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
		"""
		Fit a classifier and use it to determine cluster predictive power

		Steps performed:
			1. Flag genes as valid (written to ds.ra._Valid) when their nonzero count is
			   above 5 and below half the number of columns in ds.
			2. Fit a normalizer and compute per-cluster enrichment using the "Clusters" labels.
			3. Keep the 25 most enriched genes of every cluster and project with PCA.
			4. Train a random forest on 80% of the cells to predict ds.ca.ClusterName and
			   evaluate it on the remaining 20%.

		The fitted label encoder, the classification report and the class probabilities
		are stored as self.le, self.report and self.proba.

		Note: the train/test split is not seeded, so the report varies between runs.

		Args:
			ds		Dataset
			plot	Filename for optional plot

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""
		logging.info("Marking invalid genes")
		nnz = ds.map([np.count_nonzero], axis=0)[0]
		valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
		ds.ra._Valid = valid_genes

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		logging.info("Feature selection")
		(_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
		# Boolean gene mask: union of the top 25 enriched genes of every cluster
		genes = np.zeros_like(ds.ra.Gene, dtype=bool)
		for ix in range(enrichment.shape[1]):
			genes[np.argsort(-enrichment[:, ix])[:25]] = True

		logging.info("PCA projection")
		pca = cg.PCAProjection(genes, max_n_components=50)
		transformed = pca.fit_transform(ds, normalizer)

		le = LabelEncoder().fit(ds.ca.ClusterName)
		self.le = le
		labels = le.transform(ds.ca.ClusterName)

		train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
		classifier = RandomForestClassifier(max_depth=30)
		classifier.fit(train_X, train_Y)
		self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
		# Probabilities are computed for all cells, including those used for training
		self.proba = classifier.predict_proba(transformed)

		if plot:
			# Mean predicted probability per observed label (rows) and predicted label (columns)
			agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
			# Draw on a dedicated figure and always close it, so that repeated calls do not
			# pile images and colorbars onto whatever figure happens to be current
			fig = plt.figure()
			try:
				plt.imshow(agg, cmap="viridis")
				plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
				plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
				plt.xlabel("Predicted cell type")
				plt.ylabel("Observed cell type")
				plt.title("Predictive power of cluster identities")
				cbar = plt.colorbar()
				cbar.set_label('Average classification probability', rotation=90)
				plt.savefig(plot, bbox_inches="tight")
			finally:
				plt.close(fig)

		return self.proba
Esempio n. 2
0
    def fit(self, ds: loompy.LoomConnection) -> None:
        """
        Train the classifier on a dataset labelled by "SubclassAssigned".

        Writes ds.ra._Valid, and two text files in self.classified_dir:
        "genes.txt" (accession and validity flag per gene) and
        "performance.txt" (classification report on the held-out 20%).

        Stores on self: mu, sd, pca, classes, le, labels, classifier.

        Args:
            ds      Dataset to train on
        """
        logging.info("Marking invalid genes")
        nonzero_counts = ds.map([np.count_nonzero], axis=0)[0]
        above_floor = nonzero_counts > 5
        below_ceiling = nonzero_counts < ds.shape[1] * 0.5
        valid_genes = np.logical_and(above_floor, below_ceiling).astype("int")
        ds.ra._Valid = valid_genes

        # One tab-separated line per gene: accession, then 1 (valid) or 0 (invalid)
        genes_path = os.path.join(self.classified_dir, "genes.txt")
        with open(genes_path, "w") as out:
            for row, flag in enumerate(valid_genes):
                out.write(ds.Accession[row])
                out.write("\t")
                out.write(str(flag))
                out.write("\n")

        logging.info("Normalization")
        normalizer = cg.Normalizer(True)
        normalizer.fit(ds)
        # Keep the training statistics so they can be reused at prediction time
        self.mu = normalizer.mu
        self.sd = normalizer.sd

        logging.info("Feature selection")
        selected = cg.FeatureSelection(2000).fit(ds)

        logging.info("PCA projection")
        self.pca = cg.PCAProjection(selected, max_n_components=50)
        embedding = self.pca.fit_transform(ds, normalizer)

        self.classes = ds.col_attrs["SubclassAssigned"]
        self.le = LabelEncoder().fit(self.classes)
        self.labels = self.le.transform(self.classes)

        # Fixed seed makes the 80/20 split reproducible
        split = train_test_split(embedding,
                                 self.labels,
                                 test_size=0.2,
                                 random_state=0)
        train_X, test_X, train_Y, test_Y = split

        self.classifier = SVC(probability=True)
        self.classifier.fit(train_X, train_Y)

        report_path = os.path.join(self.classified_dir, "performance.txt")
        with open(report_path, "w") as out:
            predicted = self.classifier.predict(test_X)
            out.write(
                classification_report(test_Y,
                                      predicted,
                                      target_names=self.le.classes_))
Esempio n. 3
0
    def predict(
        self,
        ds: loompy.LoomConnection,
        probability: bool = False
    ) -> Union[List[str], Tuple[List[str], np.ndarray, List[str]]]:
        """
        Predict class labels for the cells in ds.

        Uses self.mu, self.sd, self.pca, self.classifier and self.le.

        Args:
            ds              Dataset to classify
            probability     If true, also return class probabilities

        Returns:
            If probability is false: the predicted labels, decoded by self.le.
            Otherwise a tuple (labels, probs, classes) where probs is the output of
            predict_proba and classes are the decoded self.classifier.classes_.
        """
        logging.info("Normalization")
        normalizer = cg.Normalizer(True)
        normalizer.fit(ds)
        normalizer.mu = self.mu  # Use the same row means as were used during training
        normalizer.sd = self.sd

        logging.info("PCA projection")
        transformed = self.pca.transform(ds, normalizer)

        logging.info("Class prediction")
        labels = self.classifier.predict(transformed)
        if not probability:
            return self.le.inverse_transform(labels)

        probs = self.classifier.predict_proba(transformed)
        return (self.le.inverse_transform(labels), probs,
                self.le.inverse_transform(self.classifier.classes_))
Esempio n. 4
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        Builds a subspace KNN graph on the selected genes, then refines it for five
        rounds: Louvain clustering of the mutual KNN graph, marker selection on the
        resulting clusters, and a new subspace KNN graph. Reads self.n_genes and self.k.

        Side effects: overwrites ds.ra._Valid and ds.ca.Clusters.

        Returns:
            knn     The knn graph as a sparse matrix
            mknn    Mutual knn subgraph
            pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("Validating genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1),
                     ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

        logging.info("Loading data for selected genes")
        # Cells in rows, selected genes in columns
        data = np.zeros((n_cells, genes.shape[0]))
        for (ix, selection, view) in ds.scan(axis=1):
            data[selection - ix, :] = view[genes, :].T

        logging.info("Computing initial subspace KNN")
        # Start with every gene included in every cell's subspace
        subspaces = np.ones(data.shape)
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

        for t in range(5):
            logging.info(f"Refining subspace KNN (iteration {t + 1})")

            logging.info("Louvain clustering")
            graph = nx.from_scipy_sparse_matrix(mknn)
            partitions = community.best_partition(graph)
            labels = np.array(
                [partitions[key] for key in range(mknn.shape[0])])
            ds.ca.Clusters = labels
            n_labels = np.max(labels) + 1
            logging.info(f"Found {n_labels} clusters")

            logging.info("Marker selection")
            (_, enrichment, _) = cg.MarkerSelection(n_markers=10,
                                                    findq=False).fit(ds)
            subspaces = np.zeros(data.shape)
            n_top = self.n_genes // n_labels
            for ix in range(enrichment.shape[1]):
                # The ranking depends only on the cluster, so compute it once and set the
                # whole column block in one assignment instead of once per cell.
                # NOTE(review): every cell receives the markers of every cluster, so all
                # rows of `subspaces` end up identical; and the indices come from rows of
                # `enrichment` but are used as columns of `data` (selected genes only).
                # Confirm both are intended.
                top = np.argsort(-enrichment[:, ix])[:n_top]
                subspaces[:, top] = 1
            knn = subspace_knn_graph(data, subspaces)
            mknn = knn.minimum(knn.transpose()).tocoo()

        perplexity = min(self.k, (n_cells - 1) / 3 - 1)
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

        return (knn, mknn, tsne_pos)
Esempio n. 5
0
	def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold

		Only cells with col attribute _Valid == 1 take part; the results are mapped back
		to the full set of cells before returning.

		Settings read from self (not arguments of this method):
			n_genes				Number of genes to use for manifold learning (ignored if genes is not None)
			gtsne				Use graph t-SNE for layout (otherwise standard t-SNE)
			alpha				The scale parameter for multiscale KNN
			genes				List of genes to use for manifold learning
			filter_cellcycle	Path to a whitespace-separated list of cell cycle genes to mask, or None
			layer				Layer passed to the PCA projection

		Side effects: sets row attribute _Selected and col attribute Clusters on ds.

		Args:
			ds		Dataset

		Returns:
			knn		The multiscale knn graph as a sparse matrix, with k = min(100, n_valid - 1)
			mknn	Mutual subgraph of the knn edges with weight above 0.05
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2);
					rows of excluded cells hold the per-axis minimum of the layout
		"""
		n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
		n_total = ds.shape[1]
		logging.info("%d of %d cells were valid", n_valid, n_total)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
		cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			# Close the gene list file deterministically instead of leaking the handle
			with open(self.filter_cellcycle) as f:
				cell_cycle_genes = np.array(f.read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("None cell cycle genes where filtered, check your gene list")
		else:
			mask = None

		if self.genes is None:
			# First pass: cluster on variable genes, then replace them by cluster markers
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
			temp = np.zeros(ds.shape[0])
			temp[genes] = 1
			ds.set_attr("_Selected", temp, axis=0)
			logging.info("%d genes selected", temp.sum())

			n_components = min(50, n_valid)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
			pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
			transformed = pca_transformed

			logging.info("Generating KNN graph")
			k = min(10, n_valid - 1)
			nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
			nn.fit(transformed)
			knn = nn.kneighbors_graph(mode='connectivity')
			knn = knn.tocoo()
			mknn = knn.minimum(knn.transpose()).tocoo()

			logging.info("Louvain-Jaccard clustering")
			lj = cg.LouvainJaccard(resolution=1)
			labels = lj.fit_predict(knn)

			# Make labels for excluded cells == -1
			labels_all = np.zeros(ds.shape[1], dtype='int') + -1
			labels_all[cells] = labels
			ds.set_attr("Clusters", labels_all, axis=1)
			n_labels = np.max(labels) + 1
			logging.info("Found " + str(n_labels) + " LJ clusters")

			logging.info("Marker selection")
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
		else:
			genes = self.genes

		# Second pass (the only pass when self.genes was given): same steps on `genes`
		temp = np.zeros(ds.shape[0])
		temp[genes] = 1
		ds.set_attr("_Selected", temp, axis=0)
		logging.info("%d genes selected", temp.sum())

		n_components = min(50, n_valid)
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
		pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
		transformed = pca_transformed

		logging.info("Generating KNN graph")
		k = min(10, n_valid - 1)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors_graph(mode='connectivity')
		knn = knn.tocoo()
		mknn = knn.minimum(knn.transpose()).tocoo()

		logging.info("Louvain-Jaccard clustering")
		lj = cg.LouvainJaccard(resolution=1)
		labels = lj.fit_predict(knn)

		# Make labels for excluded cells == -1
		labels_all = np.zeros(ds.shape[1], dtype='int') + -1
		labels_all[cells] = labels
		ds.set_attr("Clusters", labels_all, axis=1)
		n_labels = np.max(labels) + 1
		logging.info("Found " + str(n_labels) + " LJ clusters")

		logging.info("Marker selection")
		# NOTE(review): unlike the first pass, no cell cycle mask is passed here, and the
		# markers replace self.genes for the final projection — confirm this is intended
		(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

		# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
		cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
		n_components = min(50, cells_adjusted.shape[0])
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components)
		pca.fit(ds, normalizer, cells=cells_adjusted)
		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer, cells=cells)

		k = min(100, n_valid - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
		n_cells = knn.shape[0]
		# Edge list in rank-major order: a = source cell, b = its neighbor of a given rank
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		# Edge weight decays with neighbor rank: 1 / rank ** alpha
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		threshold = w > 0.05
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_valid - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
		# Excluded cells are placed at the per-axis minimum of the layout
		tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
		tsne_all[cells] = tsne_pos

		# Transform back to the full set of cells
		knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
		mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

		return (knn, mknn, tsne_all)
Esempio n. 6
0
	def fit(self, ds: loompy.LoomConnection, initial_pos: np.ndarray = None, nng: np.ndarray = None, blocked_genes: np.ndarray = None) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold

		Settings read from self (not arguments of this method):
			n_genes				Number of genes to use for manifold learning (ignored if genes is not None)
			gtsne				Use graph t-SNE for layout (otherwise standard t-SNE)
			alpha				The scale parameter for multiscale KNN
			genes				List of genes to use for manifold learning
			k					Number of neighbors (capped at n_cells - 1)
			max_iter			Passed to the t-SNE layout
			filter_cellcycle	Path to a whitespace-separated list of cell cycle genes to mask, or None

		Side effects: sets ds.ra._Selected. ds.ca.Clusters is overwritten temporarily during
		marker selection and restored afterwards if it existed before.

		Args:
			ds				Dataset
			initial_pos		Use this initial layout, shape (ds.shape[1], 2)
			nng				Non-neuronal genes (mask array); currently not used by this method
			blocked_genes	Don't use these genes (mask array)

		Returns:
			knn		The multiscale knn graph as a sparse matrix
			mknn	Mutual subgraph of the knn edges with weight above 0.025
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
		"""
		n_cells = ds.shape[1]
		logging.info("Processing all %d cells", n_cells)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			# Close the gene list file deterministically instead of leaking the handle
			with open(self.filter_cellcycle) as f:
				cell_cycle_genes = np.array(f.read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("None cell cycle genes where filtered, check your gene list")
		else:
			mask = None

		if blocked_genes is not None:
			if mask is None:
				mask = blocked_genes
			else:
				# Both masks flag genes to exclude, so a gene flagged by either one must stay
				# excluded: take the union (an intersection would drop most exclusions)
				mask = mask | blocked_genes

		if self.genes is None:
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

			n_components = min(50, n_cells)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca_transformed = pca.fit_transform(ds, normalizer)
			transformed = pca_transformed

			logging.info("Generating balanced KNN graph")
			np.random.seed(0)
			k = min(self.k, n_cells - 1)
			bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
			bnn.fit(transformed)
			knn = bnn.kneighbors_graph(mode='connectivity')
			knn = knn.tocoo()
			mknn = knn.minimum(knn.transpose()).tocoo()

			logging.info("MKNN-Louvain clustering with outliers")
			random.seed(13)
			lj = cg.LouvainJaccard(resolution=1, jaccard=False)
			labels = lj.fit_predict(knn)
			# Clusters with fewer than 10 cells become outliers (-1); the rest are renumbered 0..n-1
			bigs = np.where(np.bincount(labels) >= 10)[0]
			mapping = {k: v for v, k in enumerate(bigs)}
			labels = np.array([mapping.get(x, -1) for x in labels])

			n_labels = np.max(labels) + 1
			logging.info("Found " + str(n_labels) + " clusters")

			logging.info("Marker selection")
			# Temporarily store the provisional labels in ds.ca.Clusters (shifted so that
			# outliers are not negative), then restore whatever was there before
			temp = None
			if "Clusters" in ds.ca:
				temp = ds.ca.Clusters
			ds.ca.Clusters = labels - labels.min()
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask, findq=False).fit(ds)
			if temp is not None:
				ds.ca.Clusters = temp
		else:
			genes = self.genes

		temp = np.zeros(ds.shape[0], dtype='bool')
		temp[genes] = True
		ds.ra._Selected = temp.astype('int')
		logging.info("%d genes selected", temp.sum())

		if self.genes is None:
			# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
			cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
			n_components = min(50, cells_adjusted.shape[0])
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca.fit(ds, normalizer, cells=cells_adjusted)
		else:
			n_components = min(50, n_cells)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components)
			pca.fit(ds, normalizer)

		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer)

		k = min(self.k, n_cells - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
		bnn.fit(transformed)
		# Drop the first neighbor column (presumably the cell itself — TODO confirm)
		knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
		n_cells = knn.shape[0]
		# Edge list in rank-major order: a = source cell, b = its neighbor of a given rank
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		# Edge weight decays with neighbor rank: 1 / rank ** alpha
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		threshold = w > 0.025
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_cells - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, knn=knn.tocsr(), initial_pos=initial_pos)
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, initial_pos=initial_pos)

		return (knn, mknn, tsne_pos)
Esempio n. 7
0
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        """
        Aggregate ds by cluster into a new loom file and annotate it.

        The aggregated file receives the layers "trinaries" (plus "trinaries_<f>" for
        every additional value when self.f is a list or tuple), "enrichment" and
        "enrichment_q", and the column attributes NCells and ClusterScore.

        Clusters are renumbered by similarity (hierarchical clustering with optimal
        leaf ordering) and genes are reordered with markers first. Both ds and the
        aggregated file are permuted in place, and ds.ca.Clusters is rewritten.

        Reads self.f and self.n_markers.

        Args:
            ds          Dataset with a "Clusters" column attribute
            out_file    Path of the aggregated loom file to create
            agg_spec    Mapping from column attribute name to aggregation function name;
                        a built-in default is used when None
        """
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean"
            }
        # Cells with a negative cluster label are not counted
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters by mean")
        cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if isinstance(self.f, (list, tuple)):
                for ix, f in enumerate(self.f):
                    trinaries = cg.Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
                # NOTE(review): after this loop `trinaries` holds the result for the LAST
                # value of f, and that is what the cluster scores below are computed from
            else:
                trinaries = cg.Trinarizer(f=self.f).fit(ds)
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            (markers, enrichment,
             qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
            dsout.layers["enrichment"] = enrichment
            dsout.layers["enrichment_q"] = qvals

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")
            if "_Selected" in ds.ra:
                genes = (ds.ra._Selected == 1)
            else:
                logging.info("Normalization")
                normalizer = cg.Normalizer(False)
                normalizer.fit(ds)
                logging.info("Selecting up to 1000 genes")
                genes = cg.FeatureSelection(1000).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

            # Clusters in rows, selected genes in columns, log-transformed
            data = np.log(dsout[:, :] + 1)[genes, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward')
            optimal_Z = optimal_leaf_ordering(Z, D)
            ordering = hc.leaves_list(optimal_Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array([d.get(x, -1) for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            # Offsetting by the number of clusters sorts all non-markers after the markers
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            n_markers = self.n_markers
            data = trinaries[:, ordering][gene_order, :][:n_markers *
                                                         n_labels, :].T
            # Score each cluster on its own window of n_markers genes. The window width
            # must match the truncation above; a fixed width of 10 is only right when
            # n_markers == 10.
            cluster_scores = [
                data[ix, ix * n_markers:(ix + 1) * n_markers].sum()
                for ix in range(n_labels)
            ]
            dsout.ca.ClusterScore = np.array(cluster_scores)
Esempio n. 8
0
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
        Discover the manifold

        Settings read from self (not arguments of this method):
            n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
            gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
            alpha               The scale parameter for multiscale KNN
            genes               List of genes to use for manifold learning
            k                   Number of neighbors (capped at n_cells - 1)
            filter_cellcycle    Path to a whitespace-separated list of cell cycle genes to mask, or None

        Side effects: sets row attribute _Selected; when self.genes is None, also
        overwrites column attribute Clusters with provisional labels (-1 for outliers).

        Args:
            ds      Dataset

        Returns:
            knn     The multiscale knn graph as a sparse matrix
            mknn    Mutual subgraph of the knn edges with weight above 0.05
            pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
        """
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("%d of %d genes were valid",
                     np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)

        if self.filter_cellcycle is not None:
            # Close the gene list file deterministically instead of leaking the handle
            with open(self.filter_cellcycle) as f:
                cell_cycle_genes = np.array(f.read().split())
            mask = np.in1d(ds.row_attrs["Gene"], cell_cycle_genes)
            if np.sum(mask) == 0:
                logging.warning(
                    "None cell cycle genes where filtered, check your gene list"
                )
        else:
            mask = None

        if self.genes is None:
            logging.info("Selecting up to %d genes", self.n_genes)
            genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                          mu=normalizer.mu,
                                                          sd=normalizer.sd,
                                                          mask=mask)

            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca_transformed = pca.fit_transform(ds, normalizer)
            transformed = pca_transformed

            logging.info("Generating balanced KNN graph")
            k = min(self.k, n_cells - 1)
            bnn = cg.BalancedKNN(k=k, maxl=2 * k)
            bnn.fit(transformed)
            knn = bnn.kneighbors_graph(mode='connectivity')
            knn = knn.tocoo()
            mknn = knn.minimum(knn.transpose()).tocoo()

            logging.info("MKNN-Louvain clustering with outliers")
            (a, b, w) = (mknn.row, mknn.col, mknn.data)
            G = igraph.Graph(list(zip(a, b)),
                             directed=False,
                             edge_attrs={'weight': w})
            VxCl = G.community_multilevel(return_levels=False,
                                          weights="weight")
            labels = np.array(VxCl.membership)
            # Clusters with fewer than 10 cells become outliers (-1); the rest are renumbered 0..n-1
            bigs = np.where(np.bincount(labels) >= 10)[0]
            mapping = {k: v for v, k in enumerate(bigs)}
            labels = np.array([mapping.get(x, -1) for x in labels])

            # Outlier cells carry the label -1
            ds.set_attr("Clusters", labels, axis=1)
            n_labels = np.max(labels) + 1
            logging.info("Found " + str(n_labels) + " clusters")

            logging.info("Marker selection")
            (genes, _,
             _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)
        else:
            genes = self.genes

        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        if self.genes is None:
            # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
            cells_adjusted = cg.cap_select(labels - labels.min(),
                                           np.arange(n_cells),
                                           int(n_cells * 0.2))
            n_components = min(50, cells_adjusted.shape[0])
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer, cells=cells_adjusted)
        else:
            n_components = min(50, n_cells)
            logging.info("PCA projection to %d components", n_components)
            pca = cg.PCAProjection(genes, max_n_components=n_components)
            pca.fit(ds, normalizer)

        # Note that here we're transforming all cells; we just did the fit on the selection
        transformed = pca.transform(ds, normalizer)

        k = min(self.k, n_cells - 1)
        logging.info("Generating multiscale KNN graph (k = %d)", k)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k)
        bnn.fit(transformed)
        # Drop the first neighbor column (presumably the cell itself — TODO confirm)
        knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
        n_cells = knn.shape[0]
        # Edge list in rank-major order: a = source cell, b = its neighbor of a given rank
        a = np.tile(np.arange(n_cells), k)
        b = np.reshape(knn.T, (n_cells * k, ))
        # Edge weight decays with neighbor rank: 1 / rank ** alpha
        w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
        knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
        threshold = w > 0.05
        mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])),
                                 shape=(n_cells, n_cells))
        mknn = mknn.minimum(mknn.transpose()).tocoo()

        perplexity = min(k, (n_cells - 1) / 3 - 1)
        if self.gtsne:
            logging.info("gt-SNE layout")
            # Note that perplexity argument is ignored in this case, but must still be given
            # because bhtsne will check that it has a valid value
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed,
                                                             knn=knn.tocsr())
        else:
            logging.info("t-SNE layout")
            tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)

        return (knn, mknn, tsne_pos)
Esempio n. 9
0
    def run(self) -> None:
        """
        Aggregate the input loom file by cluster, then lay out and annotate the result.

        Steps:
            1. Aggregate the input file into the task's output file.
            2. Compute a KNN/MKNN graph and t-SNE layout on the aggregated clusters.
            3. Compute a graph and gt-SNE layout for all cells, initialised from the
               cluster positions plus Gaussian noise.
            4. Run auto-annotation and auto-auto-annotation on the aggregated file.

        Both the input file (graphs and _X/_Y; also whatever the aggregator modifies)
        and the aggregated output file are written to.
        """
        # Task-specific logger; intentionally shadows the logging module inside this method
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            # Context manager guarantees the input connection is closed, even on error
            with loompy.connect(self.input().fn) as ds:
                spec = {
                    "Age": "tally",
                    "Clusters": "first",
                    "Class": "mode",
                    "_Total": "mean",
                    "Sex": "tally",
                    "Tissue": "tally",
                    "SampleID": "tally",
                    "TissuePool": "first",
                    "Outliers": "mean",
                    "Bucket": "mode",
                    "Region": "first",
                    "OriginalClusters": "first",
                    "LeafOrder": "first",
                    "Probable_location": "first",
                    "Developmental_compartment": "first",
                    "Description": "first",
                    "Location_based_on": "first",
                    "Neurotransmitter": "first",
                    "Comment": "first",
                    "ClusterName": "first",
                    "TaxonomyRank1": "first",
                    "TaxonomyRank2": "first",
                    "TaxonomyRank3": "first",
                    "TaxonomyRank4": "first",
                    "TaxonomySymbol": "first"
                }
                cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)

                with loompy.connect(out_file) as dsagg:
                    logging.info(
                        "Finding non-neuronal, housekeeping, and troublemaking genes"
                    )
                    (nng, blocked) = _gene_selection_L5(dsagg)

                    logging.info("Manifold learning on the aggregate file")
                    normalizer = cg.Normalizer(False)
                    normalizer.fit(dsagg)
                    # Use the first 10 * n_clusters rows as genes
                    # (presumably the marker genes, sorted first by the aggregator — TODO confirm)
                    pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10),
                                           max_n_components=50)
                    pca.fit(dsagg, normalizer)
                    transformed = pca.transform(dsagg, normalizer)
                    k = 40
                    bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                    bnn.fit(transformed)
                    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                    n_cells = knn.shape[0]
                    # Edge list in rank-major order, weighted by 1 / rank ** 1.8
                    a = np.tile(np.arange(n_cells), k)
                    b = np.reshape(knn.T, (n_cells * k, ))
                    w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8), n_cells)
                    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
                    threshold = w > 0.025
                    mknn = sparse.coo_matrix(
                        (w[threshold], (a[threshold], b[threshold])),
                        shape=(n_cells, n_cells))
                    mknn = mknn.minimum(mknn.transpose()).tocoo()
                    tsne = cg.TSNE(perplexity=5).layout(transformed)
                    dsagg.col_graphs.KNN = knn
                    dsagg.col_graphs.MKNN = mknn
                    dsagg.ca._X = tsne[:, 0]
                    dsagg.ca._Y = tsne[:, 1]

                    logging.info("Manifold learning on all cells")
                    # Start every cell at its cluster's position plus unit Gaussian noise
                    init = np.zeros((ds.shape[1], 2))
                    for lbl in np.unique(ds.ca.Clusters):
                        init[ds.ca.Clusters ==
                             lbl, :] = tsne[lbl, :] + np.random.normal(size=(
                                 (ds.ca.Clusters == lbl).sum(), 2))
                    ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                    (knn, mknn, tsne) = ml.fit(ds,
                                               initial_pos=init,
                                               nng=nng,
                                               blocked_genes=blocked)
                    ds.col_graphs.KNN = knn
                    ds.col_graphs.MKNN = mknn
                    ds.ca._X = tsne[:, 0]
                    ds.ca._Y = tsne[:, 1]

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root="../auto-annotation/Adolescent/")
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(ds.ra.Gene[selected[:, ix]])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the score arrays below are rendered by str()
                    np.set_printoptions(precision=1, suppress=True)
                    dsagg.set_attr("MarkerSelectivity",
                                   np.array([
                                       str(selectivity[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    dsagg.set_attr("MarkerSpecificity",
                                   np.array([
                                       str(specificity[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)
                    dsagg.set_attr("MarkerRobustness",
                                   np.array([
                                       str(robustness[:, ix])
                                       for ix in np.arange(n_clusters)
                                   ]),
                                   axis=1)