def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    Genes are flagged valid (stored in ds.ra._Valid), the top 25 enriched genes
    per cluster are selected, the data is projected by PCA, and a random forest
    is trained on an 80/20 split to predict ds.ca.ClusterName.

    Side effects:
        Sets ds.ra._Valid, self.le (the fitted LabelEncoder), self.report
        (classification report on the held-out 20%) and self.proba.

    Args:
        ds      Dataset
        plot    Filename for optional plot (nothing is plotted when None or empty)

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)
    """
    logging.info("Feature selection")
    # A gene is valid when it is detected in more than 5 cells but fewer than half of them
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Feature selection")
    (_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
    genes = np.zeros_like(ds.ra.Gene, dtype=bool)
    for ix in range(enrichment.shape[1]):
        # Keep the 25 most enriched genes of every cluster (column of the enrichment matrix)
        genes[np.argsort(-enrichment[:, ix])[:25]] = True

    logging.info("PCA projection")
    pca = cg.PCAProjection(genes, max_n_components=50)
    transformed = pca.fit_transform(ds, normalizer)

    # Read the attribute once; it is needed both for fitting the encoder and for encoding
    cluster_names = ds.ca.ClusterName
    le = LabelEncoder().fit(cluster_names)
    self.le = le
    labels = le.transform(cluster_names)

    train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
    self.proba = classifier.predict_proba(transformed)

    if plot:
        # Mean predicted probability per observed label (rows) and predicted label (columns)
        agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
        # Draw on a dedicated figure and always close it, so repeated calls neither
        # leak figures nor paint over whatever figure the caller had active
        fig = plt.figure()
        try:
            plt.imshow(agg, cmap="viridis")
            ticks = np.arange(le.classes_.shape[0])
            plt.xticks(ticks, le.classes_, rotation="vertical", fontsize=7)
            plt.yticks(ticks, le.classes_, rotation="horizontal", fontsize=7)
            plt.xlabel("Predicted cell type")
            plt.ylabel("Observed cell type")
            plt.title("Predictive power of cluster identities")
            cbar = plt.colorbar()
            cbar.set_label('Average classification probability', rotation=90)
            plt.savefig(plot, bbox_inches="tight")
        finally:
            plt.close(fig)
    return self.proba
def fit(self, ds: loompy.LoomConnection) -> None:
    """
    Train the classifier on a labelled dataset

    Flags valid genes (stored in ds.ra._Valid and dumped to genes.txt), normalizes,
    selects 2000 genes, projects by PCA and fits an SVC predicting the
    "SubclassAssigned" column attribute. A classification report for the held-out
    20% of cells is written to performance.txt. Both files are created in
    self.classified_dir.

    Side effects:
        Sets self.mu, self.sd, self.pca, self.classes, self.le, self.labels
        and self.classifier.

    Args:
        ds      Dataset to train on
    """
    # Validating genes
    logging.info("Marking invalid genes")
    detected_in = ds.map([np.count_nonzero], axis=0)[0]
    # Valid means detected in more than 5 cells but in fewer than half of all cells
    valid_genes = np.logical_and(detected_in > 5, detected_in < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes

    genes_path = os.path.join(self.classified_dir, "genes.txt")
    with open(genes_path, "w") as genes_file:
        # One line per gene: accession <TAB> validity flag
        for row in range(valid_genes.shape[0]):
            genes_file.write(ds.Accession[row])
            genes_file.write("\t" + str(valid_genes[row]) + "\n")

    logging.info("Normalization")
    normalizer = cg.Normalizer(True)
    normalizer.fit(ds)
    # Remembered so that predict() can reuse the training statistics
    self.mu, self.sd = normalizer.mu, normalizer.sd

    logging.info("Feature selection")
    selected_genes = cg.FeatureSelection(2000).fit(ds)

    logging.info("PCA projection")
    self.pca = cg.PCAProjection(selected_genes, max_n_components=50)
    projected = self.pca.fit_transform(ds, normalizer)

    self.classes = ds.col_attrs["SubclassAssigned"]
    self.le = LabelEncoder().fit(self.classes)
    self.labels = self.le.transform(self.classes)

    split = train_test_split(projected, self.labels, test_size=0.2, random_state=0)
    train_X, test_X, train_Y, test_Y = split
    self.classifier = SVC(probability=True)
    self.classifier.fit(train_X, train_Y)

    report_path = os.path.join(self.classified_dir, "performance.txt")
    with open(report_path, "w") as report_file:
        held_out_prediction = self.classifier.predict(test_X)
        report_file.write(classification_report(test_Y, held_out_prediction, target_names=self.le.classes_))
def predict(self, ds: loompy.LoomConnection, probability: bool = False) -> Union[List[str], Tuple[List[str], np.ndarray, List[str]]]:
    """
    Predict class labels for the cells of a dataset using the fitted classifier

    Requires that fit() has been called, since self.mu, self.sd, self.pca,
    self.classifier and self.le are read here.

    Args:
        ds              Dataset to classify
        probability     If true, also return the class probabilities

    Returns:
        The predicted class names when probability is false; otherwise a tuple
        (predicted class names, matrix returned by predict_proba,
        class names corresponding to self.classifier.classes_)
    """
    logging.info("Normalization")
    normalizer = cg.Normalizer(True)
    normalizer.fit(ds)
    normalizer.mu = self.mu  # Use the same row means as were used during training
    normalizer.sd = self.sd

    logging.info("PCA projection")
    transformed = self.pca.transform(ds, normalizer)

    logging.info("Class prediction")
    labels = self.classifier.predict(transformed)
    predicted = self.le.inverse_transform(labels)
    if not probability:
        return predicted

    probs = self.classifier.predict_proba(transformed)
    return (predicted, probs, self.le.inverse_transform(self.classifier.classes_))
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Builds an initial KNN graph over the selected genes, then refines it for five
    rounds: Louvain clustering of the mutual KNN graph, marker selection on the
    resulting clusters, and a new subspace KNN restricted to the marker columns.

    Side effects:
        Overwrites ds.ra._Valid and, in every refinement round, ds.ca.Clusters.

    Reads self.n_genes and self.k.

    Returns:
        knn     The knn graph as a sparse matrix
        mknn    Mutual knn subgraph
        pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)

    logging.info("Validating genes")
    # Valid means detected in more than 5 cells but in fewer than half of all cells
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Selecting up to %d genes", self.n_genes)
    genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

    logging.info("Loading data for selected genes")
    data = np.zeros((n_cells, genes.shape[0]))
    for (ix, selection, view) in ds.scan(axis=1):
        # NOTE(review): rows are addressed relative to the batch start (selection - ix),
        # so successive batches appear to write to the same rows -- confirm against
        # the loompy scan() contract whether `selection` should be used directly
        data[selection - ix, :] = view[genes, :].T

    logging.info("Computing initial subspace KNN")
    subspaces = np.ones(data.shape)
    knn = subspace_knn_graph(data, subspaces)
    mknn = knn.minimum(knn.transpose()).tocoo()

    for t in range(5):
        logging.info(f"Refining subspace KNN (iteration {t + 1})")

        logging.info("Louvain clustering")
        graph = nx.from_scipy_sparse_matrix(mknn)
        partitions = community.best_partition(graph)
        labels = np.array([partitions[key] for key in range(mknn.shape[0])])
        ds.ca.Clusters = labels
        n_labels = np.max(labels) + 1
        logging.info(f"Found {n_labels} clusters")

        logging.info("Marker selection")
        (_, enrichment, _) = cg.MarkerSelection(n_markers=10, findq=False).fit(ds)
        subspaces = np.zeros(data.shape)
        n_top = self.n_genes // n_labels
        for ix in range(enrichment.shape[1]):
            # The ranking does not depend on the cell, so compute it once per cluster
            # and switch the columns on for all rows at once (same result as the
            # former per-cell loop, without n_cells redundant sorts)
            top = np.argsort(-enrichment[:, ix])[:n_top]
            # NOTE(review): every cell receives the markers of every cluster, and `top`
            # indexes rows of `enrichment` while the columns of `subspaces` follow
            # `genes` -- presumably a per-cluster subspace was intended; verify
            subspaces[:, top] = 1
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

    perplexity = min(self.k, (n_cells - 1) / 3 - 1)
    logging.info("gt-SNE layout")
    # Note that perplexity argument is ignored in this case, but must still be given
    # because bhtsne will check that it has a valid value
    tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

    return (knn, mknn, tsne_pos)
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Only cells whose column attribute _Valid equals 1 take part; the returned graphs
    and layout are expanded back to all cells of the dataset.

    Settings read from the instance:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        filter_cellcycle    Optional path to a whitespace-separated gene list used as mask
        layer               Layer handed to the PCA projections of the clustering passes

    Side effects:
        Overwrites the row attribute "_Selected" and the column attribute "Clusters"
        (excluded cells get the label -1).

    Returns:
        knn     The multiscale knn graph as a sparse matrix, with k = min(100, n_valid - 1)
        mknn    Mutual knn subgraph, built from the edges with weight > 0.05
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
    n_total = ds.shape[1]
    logging.info("%d of %d cells were valid", n_valid, n_total)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
        if np.sum(mask) == 0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    def _select_and_cluster(genes):
        """Flag genes as selected, then PCA -> KNN (k <= 10) -> Louvain-Jaccard on the valid cells."""
        selected = np.zeros(ds.shape[0])
        selected[genes] = 1
        ds.set_attr("_Selected", selected, axis=0)
        logging.info("%d genes selected", selected.sum())

        n_components = min(50, n_valid)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
        transformed = pca.fit_transform(ds, normalizer, cells=cells)

        logging.info("Generating KNN graph")
        k = min(10, n_valid - 1)
        nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
        nn.fit(transformed)
        knn = nn.kneighbors_graph(mode='connectivity').tocoo()

        logging.info("Louvain-Jaccard clustering")
        lj = cg.LouvainJaccard(resolution=1)
        labels = lj.fit_predict(knn)
        # Make labels for excluded cells == -1
        labels_all = np.full(ds.shape[1], -1, dtype='int')
        labels_all[cells] = labels
        ds.set_attr("Clusters", labels_all, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " LJ clusters")
        return labels, n_labels

    if self.genes is None:
        # First pass: cluster on variable genes, then replace them by cluster markers
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
        labels, n_labels = _select_and_cluster(genes)

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
    else:
        genes = self.genes

    # Second pass: cluster again on the current gene set and reselect markers
    labels, n_labels = _select_and_cluster(genes)

    logging.info("Marker selection")
    (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

    # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
    cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
    n_components = min(50, cells_adjusted.shape[0])
    logging.info("PCA projection to %d components", n_components)
    # NOTE(review): unlike the clustering passes, this projection is built without layer=self.layer -- confirm intended
    pca = cg.PCAProjection(genes, max_n_components=n_components)
    pca.fit(ds, normalizer, cells=cells_adjusted)
    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer, cells=cells)

    k = min(100, n_valid - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
    nn.fit(transformed)
    knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
    n_cells = knn.shape[0]
    # Edge list in rank-major order: all first neighbours, then all second neighbours, ...
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    # The weight of an edge decays with the neighbour rank r as 1 / r ** alpha
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    threshold = w > 0.05
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_valid - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
    # Excluded cells are placed at the minimum coordinates of the layout
    tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
    tsne_all[cells] = tsne_pos

    # Transform back to the full set of cells
    knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
    mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

    return (knn, mknn, tsne_all)
def fit(self, ds: loompy.LoomConnection, initial_pos: np.ndarray = None, nng: np.ndarray = None, blocked_genes: np.ndarray = None) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Settings read from the instance:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        k                   Number of neighbours (capped at n_cells - 1)
        max_iter            Passed on to the t-SNE layout
        filter_cellcycle    Optional path to a whitespace-separated gene list used as mask

    Args:
        ds              Dataset
        initial_pos     Use this initial layout, shape (ds.shape[1], 2)
        nng             Non-neuronal genes, set these to zero in neurons (mask array);
                        currently not used by this method
        blocked_genes   Don't use these genes (mask array)

    Side effects:
        Overwrites ds.ra._Selected. When genes is None, ds.ca.Clusters is temporarily
        replaced for marker selection; a pre-existing value is restored afterwards,
        otherwise the temporary labels remain. Reseeds numpy's and Python's global
        random number generators.

    Returns:
        knn     The multiscale knn graph as a sparse matrix
        mknn    Mutual knn subgraph, built from the edges with weight > 0.025
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
        if np.sum(mask) == 0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    if blocked_genes is not None:
        if mask is None:
            mask = blocked_genes
        else:
            # Both masks flag genes to leave out, so a gene is excluded when either one
            # flags it. The former intersection (&) re-admitted every blocked gene that
            # was not also a cell cycle gene.
            mask = mask | blocked_genes

    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        transformed = pca.fit_transform(ds, normalizer)

        logging.info("Generating balanced KNN graph")
        np.random.seed(0)
        k = min(self.k, n_cells - 1)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='connectivity')
        knn = knn.tocoo()

        logging.info("MKNN-Louvain clustering with outliers")
        random.seed(13)
        lj = cg.LouvainJaccard(resolution=1, jaccard=False)
        labels = lj.fit_predict(knn)
        # Clusters with fewer than 10 cells are treated as outliers (label -1);
        # the remaining clusters are renumbered consecutively from 0
        bigs = np.where(np.bincount(labels) >= 10)[0]
        mapping = {old: new for new, old in enumerate(bigs)}
        labels = np.array([mapping.get(x, -1) for x in labels])
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " clusters")

        logging.info("Marker selection")
        # Marker selection reads ds.ca.Clusters, so swap the labels in temporarily
        temp = None
        if "Clusters" in ds.ca:
            temp = ds.ca.Clusters
        # Shift so that the smallest label (-1 when there are outliers) becomes 0
        ds.ca.Clusters = labels - labels.min()
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask, findq=False).fit(ds)
        if temp is not None:
            ds.ca.Clusters = temp
    else:
        genes = self.genes

    temp = np.zeros(ds.shape[0], dtype='bool')
    temp[genes] = True
    ds.ra._Selected = temp.astype('int')
    logging.info("%d genes selected", temp.sum())

    if self.genes is None:
        # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
        cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
        n_components = min(50, cells_adjusted.shape[0])
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer, cells=cells_adjusted)
    else:
        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer)

    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer)

    k = min(self.k, n_cells - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    bnn = cg.BalancedKNN(k=k, maxl=2 * k, sight_k=2 * k)
    bnn.fit(transformed)
    # Keep the neighbour indices and drop their first column
    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
    n_cells = knn.shape[0]
    # Edge list in rank-major order: all first neighbours, then all second neighbours, ...
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    # The weight of an edge decays with the neighbour rank r as 1 / r ** alpha
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    threshold = w > 0.025
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_cells - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, knn=knn.tocsr(), initial_pos=initial_pos)
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity, max_iter=self.max_iter).layout(transformed, initial_pos=initial_pos)

    return (knn, mknn, tsne_pos)
def aggregate(self, ds: loompy.LoomConnection, out_file: str, agg_spec: Dict[str, str] = None) -> None:
    """
    Aggregate the dataset by cluster into a new loom file and annotate it

    The aggregated file receives the layers "trinaries" (plus "trinaries_<f>" for
    every additional value when self.f is a list or tuple), "enrichment" and
    "enrichment_q", and the column attributes NCells, Clusters and ClusterScore.
    Clusters are renumbered by similarity (Ward linkage with optimal leaf ordering).

    Side effects on ds:
        ds.ca.Clusters is renumbered, columns are sorted by cluster, and rows are
        permuted so that marker genes come first (ordered by the cluster in which
        they are most enriched).

    Reads self.f and self.n_markers.

    Args:
        ds          Dataset to aggregate; must have the column attribute "Clusters"
        out_file    Path of the aggregated loom file to create
        agg_spec    Maps column attribute names to aggregation methods
                    (a built-in default is used when None)
    """
    if agg_spec is None:
        agg_spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean"
        }
    # Negative cluster labels are left out of the cluster count
    cells = ds.col_attrs["Clusters"] >= 0
    labels = ds.col_attrs["Clusters"][cells]
    n_labels = len(set(labels))

    logging.info("Aggregating clusters by mean")
    cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
    with loompy.connect(out_file) as dsout:
        logging.info("Trinarizing")
        if isinstance(self.f, (list, tuple)):
            # The first value fills the default layer, each further value a suffixed layer.
            # NOTE(review): after this loop `trinaries` holds the result for the LAST value,
            # and that is what the cluster scores below are computed from -- confirm intended
            for ix, f in enumerate(self.f):
                trinaries = cg.Trinarizer(f=f).fit(ds)
                if ix == 0:
                    dsout.layers["trinaries"] = trinaries
                else:
                    dsout.layers[f"trinaries_{f}"] = trinaries
        else:
            trinaries = cg.Trinarizer(f=self.f).fit(ds)
            dsout.layers["trinaries"] = trinaries

        logging.info("Computing cluster gene enrichment scores")
        (markers, enrichment, qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
        dsout.layers["enrichment"] = enrichment
        dsout.layers["enrichment_q"] = qvals

        dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

        # Renumber the clusters
        logging.info("Renumbering clusters by similarity, and permuting columns")
        if "_Selected" in ds.ra:
            genes = (ds.ra._Selected == 1)
        else:
            logging.info("Normalization")
            normalizer = cg.Normalizer(False)
            normalizer.fit(ds)
            logging.info("Selecting up to 1000 genes")
            genes = cg.FeatureSelection(1000).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

        data = np.log(dsout[:, :] + 1)[genes, :].T
        D = pdist(data, 'euclidean')
        Z = hc.linkage(D, 'ward')
        optimal_Z = optimal_leaf_ordering(Z, D)
        ordering = hc.leaves_list(optimal_Z)

        # Permute the aggregated file, and renumber
        dsout.permute(ordering, axis=1)
        dsout.ca.Clusters = np.arange(n_labels)

        # Renumber the original file, and permute
        d = dict(zip(ordering, np.arange(n_labels)))
        new_clusters = np.array([d.get(x, -1) for x in ds.ca.Clusters])
        ds.ca.Clusters = new_clusters
        ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

        # Reorder the genes, markers first, ordered by enrichment in clusters
        logging.info("Permuting rows")
        mask = np.zeros(ds.shape[0], dtype=bool)
        mask[markers] = True
        # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
        gene_order = np.zeros(ds.shape[0], dtype='int')
        gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :], axis=1)
        # Offsetting by the number of clusters sorts all non-markers after the markers
        gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :], axis=1) + dsout.shape[1]
        gene_order = np.argsort(gene_order)
        ds.permute(gene_order, axis=0)
        dsout.permute(gene_order, axis=0)

        # Cluster score: sum of the trinarization values of a cluster over its own
        # block of marker rows. The block width follows self.n_markers, matching the
        # slice right below, instead of a hard-coded 10.
        n_markers = self.n_markers
        data = trinaries[:, ordering][gene_order, :][:n_markers * n_labels, :].T
        cluster_scores = [
            data[ix, ix * n_markers:(ix + 1) * n_markers].sum()
            for ix in range(n_labels)
        ]
        dsout.ca.ClusterScore = np.array(cluster_scores)
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Settings read from the instance:
        n_genes             Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne               Use graph t-SNE for layout (otherwise standard t-SNE)
        alpha               The scale parameter for multiscale KNN
        genes               List of genes to use for manifold learning
        k                   Number of neighbours (capped at n_cells - 1)
        filter_cellcycle    Optional path to a whitespace-separated gene list used as mask

    Side effects:
        Overwrites the row attribute "_Selected"; when genes is None the column
        attribute "Clusters" is overwritten too (outliers get the label -1).

    Returns:
        knn     The multiscale knn graph as a sparse matrix
        mknn    Mutual knn subgraph, built from the edges with weight > 0.05
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        # Close the gene list file deterministically instead of leaking the handle
        with open(self.filter_cellcycle) as f:
            cell_cycle_genes = np.array(f.read().split())
        mask = np.in1d(ds.row_attrs["Gene"], cell_cycle_genes)
        if np.sum(mask) == 0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("None cell cycle genes where filtered, check your gene list")
    else:
        mask = None

    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)

        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        transformed = pca.fit_transform(ds, normalizer)

        logging.info("Generating balanced KNN graph")
        k = min(self.k, n_cells - 1)
        bnn = cg.BalancedKNN(k=k, maxl=2 * k)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='connectivity')
        knn = knn.tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("MKNN-Louvain clustering with outliers")
        (a, b, w) = (mknn.row, mknn.col, mknn.data)
        G = igraph.Graph(list(zip(a, b)), directed=False, edge_attrs={'weight': w})
        VxCl = G.community_multilevel(return_levels=False, weights="weight")
        labels = np.array(VxCl.membership)
        # Clusters with fewer than 10 cells are treated as outliers (label -1);
        # the remaining clusters are renumbered consecutively from 0
        bigs = np.where(np.bincount(labels) >= 10)[0]
        mapping = {old: new for new, old in enumerate(bigs)}
        labels = np.array([mapping.get(x, -1) for x in labels])

        # Make labels for excluded cells == -1
        ds.set_attr("Clusters", labels, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " clusters")

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)
    else:
        genes = self.genes

    temp = np.zeros(ds.shape[0])
    temp[genes] = 1
    ds.set_attr("_Selected", temp, axis=0)
    logging.info("%d genes selected", temp.sum())

    if self.genes is None:
        # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
        cells_adjusted = cg.cap_select(labels - labels.min(), np.arange(n_cells), int(n_cells * 0.2))
        n_components = min(50, cells_adjusted.shape[0])
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer, cells=cells_adjusted)
    else:
        n_components = min(50, n_cells)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components)
        pca.fit(ds, normalizer)

    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer)

    k = min(self.k, n_cells - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    bnn = cg.BalancedKNN(k=k, maxl=2 * k)
    bnn.fit(transformed)
    # Keep the neighbour indices and drop their first column
    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
    n_cells = knn.shape[0]
    # Edge list in rank-major order: all first neighbours, then all second neighbours, ...
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k, ))
    # The weight of an edge decays with the neighbour rank r as 1 / r ** alpha
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    threshold = w > 0.05
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_cells - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)

    return (knn, mknn, tsne_pos)
def run(self) -> None:
    """
    Aggregate the input loom file by cluster, then compute layouts and annotations

    Steps: aggregate the input into the task's temporary output file, learn a KNN
    graph and t-SNE layout on the aggregate, use the cluster positions (plus normal
    noise) to initialise the layout of all cells, and store auto-annotation and
    marker gene attributes in the aggregated file.

    Side effects:
        Writes col_graphs KNN / MKNN and ca._X / ca._Y to both the aggregated file
        and the input file, and sets numpy's global print options.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # Use the connection as a context manager so the input file is closed
        # even when a later step raises
        with loompy.connect(self.input().fn) as ds:
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "LeafOrder": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first"
            }
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
            with loompy.connect(out_file) as dsagg:
                logging.info("Finding non-neuronal, housekeeping, and troublemaking genes")
                (nng, blocked) = _gene_selection_L5(dsagg)

                logging.info("Manifold learning on the aggregate file")
                normalizer = cg.Normalizer(False)
                normalizer.fit(dsagg)
                # Project on the first 10 * n_clusters rows of the aggregated file
                pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10), max_n_components=50)
                pca.fit(dsagg, normalizer)
                transformed = pca.transform(dsagg, normalizer)
                k = 40
                bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                bnn.fit(transformed)
                # Keep the neighbour indices and drop their first column
                knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                n_cells = knn.shape[0]
                # Edge list in rank-major order with weights 1 / rank ** 1.8
                a = np.tile(np.arange(n_cells), k)
                b = np.reshape(knn.T, (n_cells * k, ))
                w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8), n_cells)
                knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
                threshold = w > 0.025
                mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
                mknn = mknn.minimum(mknn.transpose()).tocoo()
                tsne = cg.TSNE(perplexity=5).layout(transformed)
                dsagg.col_graphs.KNN = knn
                dsagg.col_graphs.MKNN = mknn
                dsagg.ca._X = tsne[:, 0]
                dsagg.ca._Y = tsne[:, 1]

                logging.info("Manifold learning on all cells")
                # Start every cell at the position of its cluster in the aggregate
                # layout, jittered with standard normal noise
                clusters = ds.ca.Clusters
                init = np.zeros((ds.shape[1], 2))
                for lbl in np.unique(clusters):
                    members = clusters == lbl
                    init[members, :] = tsne[lbl, :] + np.random.normal(size=(members.sum(), 2))
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                (knn, mknn, tsne) = ml.fit(ds, initial_pos=init, nng=nng, blocked_genes=blocked)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root="../auto-annotation/Adolescent/")
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                gene_names = ds.ra.Gene
                dsagg.set_attr("MarkerGenes", np.array([" ".join(gene_names[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                # The printing options determine how the score vectors below are rendered as strings
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)