def fit(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> None:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    n_cells = cells.shape[0]
    n_genes = self.genes.shape[0]
    # Support out-of-order datasets
    if "Accession" in ds.row_attrs:
        self.accessions = ds.row_attrs["Accession"]
    self.pca = IncrementalPCA(n_components=self.n_components)
    if self.layer is not None:
        # The try/except works around a bug with the layer attribute of unpickled objects:
        # if the attribute is broken, fall back to the main matrix below
        try:
            for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
                vals = normalizer.transform(view.layers[self.layer][:, :], selection)
                if self.nng is not None:
                    vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
                self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
        except AttributeError:
            self.layer = None
    if self.layer is None:
        for (ix, selection, view) in ds.scan(items=cells, axis=1):
            vals = normalizer.transform(view[:, :], selection)
            if self.nng is not None:
                vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
            self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
def fit_transform(self, ds: loompy.LoomConnection) -> None:
    # Poisson pooling
    self.fit(ds)
    knn = self.knn.astype("bool")
    logging.debug("Poisson pooling")
    ds["pooled"] = 'int32'
    if self.compute_velocity and "spliced" in ds.layers:
        ds["spliced_pooled"] = 'int32'
        ds["unspliced_pooled"] = 'int32'
        for (_, indexes, view) in ds.scan(axis=0, layers=["spliced", "unspliced"], what=["layers"]):
            ds["spliced_pooled"][indexes.min():indexes.max() + 1, :] = view.layers["spliced"][:, :] @ knn.T
            ds["unspliced_pooled"][indexes.min():indexes.max() + 1, :] = view.layers["unspliced"][:, :] @ knn.T
            ds["pooled"][indexes.min():indexes.max() + 1, :] = (
                ds["spliced_pooled"][indexes.min():indexes.max() + 1, :]
                + ds["unspliced_pooled"][indexes.min():indexes.max() + 1, :])
    else:
        for (_, indexes, view) in ds.scan(axis=0, layers=[""], what=["layers"]):
            ds["pooled"][indexes.min():indexes.max() + 1, :] = view[:, :] @ knn.T
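# A minimal numpy sketch (toy data, not part of the pipeline) of the pooling step
# above: each cell's counts are summed with those of its KNN neighbors through a
# boolean adjacency matrix, exactly as `view[:, :] @ knn.T` does in fit_transform().
import numpy as np

counts = np.random.poisson(1.0, size=(4, 5))  # genes x cells
knn = (np.eye(5) + np.eye(5, k=1) + np.eye(5, k=-1)).astype(bool)  # toy neighbor graph (including self)
pooled = counts @ knn.T  # pooled[g, i] = sum of gene g over cell i's neighborhood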
def fit(self, ds: loompy.LoomConnection, mu: np.ndarray = None, sd: np.ndarray = None, totals: np.ndarray = None) -> None:
    self.sd = sd
    self.mu = mu
    self.totals = totals
    if mu is None or sd is None:
        (self.sd, self.mu) = ds.map([np.std, np.mean], axis=0)
    if totals is None:
        self.totals = ds.map([np.sum], chunksize=100, axis=1)[0]
def _fit(self, ds: loompy.LoomConnection, labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    logging.info("Computing enrichment statistic")
    n_labels = len(np.unique(labels))
    n_genes, n_cells = ds.shape
    # Number of cells per cluster
    sizes = np.bincount(labels, minlength=n_labels)
    # Number of nonzero values per cluster
    nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
    # Mean value per cluster
    means = ds.aggregate(None, None, labels, "mean", None)
    # Non-zeros and means over all cells
    (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)
    # Scale by number of cells
    f_nnz = nnz / sizes
    f_nnz_overall = nnz_overall / n_cells
    # Means and fraction non-zero values in other clusters (per cluster)
    means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
    f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)
    # Enrichment: a gene is enriched in a cluster if it is nonzero more often and has a
    # higher mean there than in all *other* clusters (pseudocounts stabilize small values)
    enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

    # Select best markers
    if self.valid_genes is None:
        logging.info("Identifying valid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        self.valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
    if self.mask is None:
        excluded = set(np.where(~self.valid_genes)[0])
    else:
        excluded = set(np.where((~self.valid_genes) | self.mask)[0])
    included = np.zeros(n_genes, dtype=bool)
    for ix in range(n_labels):
        enriched = np.argsort(enrichment[:, ix])[::-1]
        n = 0
        count = 0
        while count < self.n_markers_per_cluster:
            if enriched[n] in excluded:
                n += 1
                continue
            included[enriched[n]] = True
            excluded.add(enriched[n])
            n += 1
            count += 1
    return (included, enrichment, means)
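# Self-contained sketch of the enrichment statistic computed in _fit() above, on a
# toy dense matrix (hypothetical data; no loompy needed). Same formula: fraction
# nonzero and mean within a cluster, relative to all *other* clusters.
import numpy as np

def enrichment_sketch(X: np.ndarray, labels: np.ndarray) -> np.ndarray:
    n_genes, n_cells = X.shape
    n_labels = labels.max() + 1
    sizes = np.bincount(labels, minlength=n_labels)
    nnz = np.zeros((n_genes, n_labels))
    means = np.zeros((n_genes, n_labels))
    for lbl in range(n_labels):
        nnz[:, lbl] = np.count_nonzero(X[:, labels == lbl], axis=1)
        means[:, lbl] = X[:, labels == lbl].mean(axis=1)
    f_nnz = nnz / sizes
    f_nnz_overall = np.count_nonzero(X, axis=1) / n_cells
    means_overall = X.mean(axis=1)
    means_other = ((means_overall * n_cells)[:, None] - means * sizes) / (n_cells - sizes)
    f_nnz_other = ((f_nnz_overall * n_cells)[:, None] - f_nnz * sizes) / (n_cells - sizes)
    return (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

# Example: 5 genes, 6 cells in two clusters of three cells each
X = np.random.poisson(1.0, size=(5, 6))
print(enrichment_sketch(X, np.array([0, 0, 0, 1, 1, 1])))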
def mito_genes_ratio(ds: loompy.LoomConnection) -> None:
    mito_genes = np.where(ds.ra.Chromosome == "MT")[0]
    exp_mito = ds[mito_genes, :]
    sum_mito = exp_mito.sum(axis=0)
    sum_all = ds.ca.TotalUMI
    mito_ratio = np.divide(sum_mito, sum_all)
    ds.ca["MT_ratio"] = mito_ratio
def fit(self, ds: loompy.LoomConnection) -> None:
    self.sd = np.zeros(ds.shape[0])
    self.mu = np.zeros(ds.shape[0])
    self.totals = np.zeros(ds.shape[1])
    for _, selection, view in ds.scan(axis=0):
        vals = view[self.layer][:, :].astype("float")
        self.totals += np.sum(vals, axis=0)
    self.level = np.median(self.totals)
    for _, selection, view in ds.scan(axis=0):
        vals = view[self.layer][:, :].astype("float")
        # Rescale to the median total UMI count, plus 1 (to avoid log of zero), then log transform
        vals = np.log2(div0(vals, self.totals) * self.level + 1)
        self.mu[selection] = np.mean(vals, axis=1)
        self.sd[selection] = np.std(vals, axis=1)
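# The div0 helper used in fit() above is assumed (it is not shown in this file);
# a minimal zero-safe elementwise division compatible with that usage would be:
import numpy as np

def div0(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # Divide a by b elementwise, returning 0 wherever the division is undefined
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(a, b)
        c[~np.isfinite(c)] = 0  # replaces inf and NaN (from division by zero)
    return c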
def expression_patterns(ds: loompy.LoomConnection, labels: np.ndarray, pep: float, f: float, cells: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Derive enrichment and trinary scores for all genes

    Args:
        ds (LoomConnection):    Dataset
        labels (numpy array):   Cluster labels (one per cell)
        pep (float):            Desired posterior error probability
        f (float):              Fraction required for a gene to be considered 'expressed'
        cells (numpy array):    Indices of cells to include

    Returns:
        score1 (numpy 2d array):    Array of (n_genes, n_labels)
        score2 (numpy 2d array):    Array of (n_genes, n_labels)
        trinary (numpy 2d array):   Array of (n_genes, n_labels)

    Remarks:
        If the cells argument is provided, the labels should include only those cells. That is,
        labels.shape[0] == cells.shape[0].

        Amit says, regarding marker genes:
        "I usually rank the genes by some kind of enrichment score.
        score1 = mean of gene within the cluster / mean of gene in all cells
        score2 = fraction of positive cells within cluster
        enrichment score = score1 * score2^power (where power == 0.5 or 1)
        I usually use 1 for 10x data"
    """
    n_labels = np.max(labels) + 1
    scores1 = np.empty((ds.shape[0], n_labels))
    scores2 = np.empty((ds.shape[0], n_labels))
    trinary_pat = np.empty((ds.shape[0], n_labels))
    trinary_prob = np.empty((ds.shape[0], n_labels))
    for (ix, selection, vals) in ds.batch_scan(cells=cells, genes=None, axis=0):
        # vals = normalizer.normalize(vals, selection)
        for j, row in enumerate(selection):
            data = vals[j, :]
            mu0 = np.mean(data)
            f0 = np.count_nonzero(data)
            score1 = np.zeros(n_labels)
            score2 = np.zeros(n_labels)
            for lbl in range(n_labels):
                if np.sum(labels == lbl) == 0:
                    continue
                sel = data[np.where(labels == lbl)[0]]
                if mu0 == 0 or f0 == 0:
                    score1[lbl] = 0
                    score2[lbl] = 0
                else:
                    score1[lbl] = np.mean(sel) / mu0
                    score2[lbl] = np.count_nonzero(sel) / sel.shape[0]  # fraction of positive cells (see docstring)
            scores1[row, :] = score1
            scores2[row, :] = score2
            trinary_prob[row, :], trinary_pat[row, :] = betabinomial_trinarize_array(data, labels, pep, f)
    return (scores1, scores2, trinary_prob, trinary_pat)
def transform(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> np.ndarray:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    n_cells = cells.shape[0]
    # Support out-of-order datasets
    if self.accessions is not None:
        # This is magic sauce for making the order of one list be like another
        # (note: ordering is computed but not applied in this variant)
        ordering = np.where(ds.row_attrs["Accession"][None, :] == self.accessions[:, None])[1]
    transformed = np.zeros((cells.shape[0], self.pca.n_components_))
    j = 0
    if self.layer is not None:
        # The try/except works around a bug with the layer attribute of unpickled objects
        try:
            for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
                vals = normalizer.transform(view.layers[self.layer][:, :], selection)
                if self.nng is not None:
                    vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
                n_cells_in_batch = selection.shape[0]
                transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
                j += n_cells_in_batch
        except AttributeError:
            self.layer = None
    if self.layer is None:
        for (ix, selection, view) in ds.scan(items=cells, axis=1):
            vals = normalizer.transform(view[:, :], selection)
            if self.nng is not None:
                vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
            n_cells_in_batch = selection.shape[0]
            transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
            j += n_cells_in_batch
    # Must select significant components only once, and reuse for future transformations
    if self.sigs is None:
        pvalue_KS = np.zeros(transformed.shape[1])  # p-value of each component
        for i in range(1, transformed.shape[1]):
            (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
        self.sigs = np.where(pvalue_KS < 0.1)[0]
        if len(self.sigs) == 0:
            self.sigs = (0, 1)
    transformed = transformed[:, self.sigs]
    return transformed
def unspliced_ratio(ds: loompy.LoomConnection, graphs: bool = True, sample_name: object = "tmp") -> None:
    u = ds.layers["unspliced"][:]
    sum_all = ds.ca.TotalUMI
    sum_u = u.sum(axis=0)
    unspliced_ratio = np.divide(sum_u, sum_all)
    ds.ca["unspliced_ratio"] = unspliced_ratio
def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    Args:
        ds      Dataset
        plot    Filename for optional plot

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)
    """
    logging.info("Validating genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Feature selection")
    (_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
    genes = np.zeros_like(ds.ra.Gene, dtype=bool)
    for ix in range(enrichment.shape[1]):
        genes[np.argsort(-enrichment[:, ix])[:25]] = True

    logging.info("PCA projection")
    pca = cg.PCAProjection(genes, max_n_components=50)
    transformed = pca.fit_transform(ds, normalizer)

    le = LabelEncoder().fit(ds.ca.ClusterName)
    self.le = le
    labels = le.transform(ds.ca.ClusterName)
    train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
    self.proba = classifier.predict_proba(transformed)
    if plot:
        agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
        plt.imshow(agg, cmap="viridis")
        plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
        plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
        plt.xlabel("Predicted cell type")
        plt.ylabel("Observed cell type")
        plt.title("Predictive power of cluster identities")
        cbar = plt.colorbar()
        cbar.set_label('Average classification probability', rotation=90)
        plt.savefig(plot, bbox_inches="tight")
    return self.proba
def fit_loom(self, ds: loompy.LoomConnection, *, tolayer: str = "enrichment", knn: Union[str, sparse.csr_matrix] = "KNN") -> None:
    if tolayer not in ds.layers:
        ds[tolayer] = "float32"
    if type(knn) is str:
        knn_matrix = ds.col_graphs[knn].tocsr()
    else:
        knn_matrix = knn
    k = knn_matrix.count_nonzero() / knn_matrix.shape[0]  # average number of neighbors
    with tqdm(total=ds.shape[0], desc="Neighborhood enrichment") as pbar:
        for ix, selection, view in ds.scan(axis=0, what=["layers"]):
            for j in range(view.shape[0]):
                ds[tolayer][j + ix, :] = self.fit(view[j, :], knn_matrix, k)
            pbar.update(view.shape[0])
def transform(self, ds: loompy.LoomConnection, normalizer: Normalizer, cells: np.ndarray = None) -> np.ndarray:
    if cells is None:
        cells = np.arange(ds.shape[1])
    transformed = np.zeros((cells.shape[0], self.pca.n_components_))
    j = 0
    # Support out-of-order datasets
    key = None
    if "Accession" in ds.row_attrs:
        key = "Accession"
    layer = self.layer if self.layer is not None else ""
    for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[layer], key=key):
        vals = normalizer.transform(view.layers[layer][:, :], selection)
        n_cells_in_batch = selection.shape[0]
        transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
        j += n_cells_in_batch
    if self.test_significance:
        # Must select significant components only once, and reuse for future transformations
        if self.sigs is None:
            pvalue_KS = np.zeros(transformed.shape[1])  # p-value of each component
            for i in range(1, transformed.shape[1]):
                (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
            self.sigs = np.where(pvalue_KS < 0.1)[0]
            if len(self.sigs) == 0:
                self.sigs = (0, 1)
        transformed = transformed[:, self.sigs]
    if self.batch_keys is not None and len(self.batch_keys) > 0:
        keys_df = pd.DataFrame.from_dict({k: ds.ca[k] for k in self.batch_keys})
        transformed = harmonize(transformed, keys_df, batch_key=self.batch_keys)
    return transformed
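# Self-contained sketch of the significance test used above: consecutive principal
# components are compared with a two-sample Kolmogorov-Smirnov test, and components
# whose distribution differs from their predecessor (p < 0.1) are kept (toy data).
import numpy as np
from scipy.stats import ks_2samp

transformed = np.random.normal(size=(1000, 5)) * np.array([5.0, 4.0, 3.0, 1.0, 1.0])
pvalue_KS = np.zeros(transformed.shape[1])
for i in range(1, transformed.shape[1]):
    (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
sigs = np.where(pvalue_KS < 0.1)[0]  # indices of significant components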
def fit(self, ds: loompy.LoomConnection) -> np.ndarray:
    cells = np.where(ds.col_attrs["Clusters"] >= 0)[0]
    labels = ds.col_attrs["Clusters"][cells]
    n_labels = np.max(labels) + 1
    logging.info("n_labels %d", n_labels)
    self.trinary_prob = np.empty((ds.shape[0], n_labels))
    self.genes = ds.ra.Gene
    for (ix, selection, view) in ds.scan(axis=0, what=["layers"]):
        vals = view[:, cells]
        for j, row in enumerate(selection):
            data = np.round(vals[j, :])
            self.trinary_prob[row, :] = self._betabinomial_trinarize_array(data, labels, self.f, n_labels)
    return self.trinary_prob
def fit(self, ds: loompy.LoomConnection, cells: np.ndarray = None, mu: np.ndarray = None, sd: np.ndarray = None, mask: np.ndarray = None) -> np.ndarray:
    """
    Fits a noise model (CV vs mean)

    Args:
        ds (LoomConnection):    Dataset
        cells (ndarray):        Cells to include when computing mean and CV (or None)
        mu, sd (ndarray):       Precomputed mean and standard deviations (optional)
        mask (ndarray):         Genes to exclude (optional)

    Returns:
        ndarray of selected genes (list of ints); the number of genes is given by self.n_genes
    """
    if mu is None or sd is None:
        (mu, sd) = ds.map((np.mean, np.std), axis=0, selection=cells)
    if "_Valid" in ds.ra:
        valid = ds.ra._Valid == 1
    else:
        valid = np.ones(ds.shape[0], dtype='bool')
    if mask is not None:
        valid = np.logical_and(valid, np.logical_not(mask))
    valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Xist")
    valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Tsix")
    valid = valid.astype('int')
    ok = np.logical_and(mu > 0, sd > 0)
    cv = sd[ok] / mu[ok]
    log2_m = np.log2(mu[ok])
    log2_cv = np.log2(cv)
    svr_gamma = 1000. / len(mu[ok])
    clf = SVR(gamma=svr_gamma)
    clf.fit(log2_m[:, np.newaxis], log2_cv)
    fitted_fun = clf.predict
    # Score is the relative position with respect to the fitted curve
    score = log2_cv - fitted_fun(log2_m[:, np.newaxis])
    score = score * valid[ok]
    self.genes = np.where(ok)[0][np.argsort(score)][-self.n_genes:]
    return self.genes
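# Self-contained sketch of the noise model above, on synthetic data: fit log2(CV)
# as a function of log2(mean) with support vector regression, then rank genes by
# how far they sit above the fitted curve (all values here are made up).
import numpy as np
from sklearn.svm import SVR

mu = np.random.gamma(2.0, 1.0, size=500) + 0.1                         # per-gene means
sd = mu / np.sqrt(mu) * np.exp(np.random.normal(scale=0.2, size=500))  # per-gene standard deviations
log2_m = np.log2(mu)
log2_cv = np.log2(sd / mu)
clf = SVR(gamma=1000.0 / len(mu))
clf.fit(log2_m[:, np.newaxis], log2_cv)
score = log2_cv - clf.predict(log2_m[:, np.newaxis])  # height above the noise curve
top_genes = np.argsort(score)[-50:]                   # most variable genes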
def plot_knn(ds: loompy.LoomConnection, out_file: str) -> None:
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    xy = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    ax = fig.add_subplot(111)
    nx.draw_networkx_edges(g, pos=xy, alpha=0.25, width=0.2, edge_color='gray')
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
def fit(self, ds: loompy.LoomConnection) -> None:
    # Validating genes
    logging.info("Marking invalid genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    with open(os.path.join(self.classified_dir, "genes.txt"), "w") as f:
        for ix in range(valid_genes.shape[0]):
            f.write(ds.Accession[ix])
            f.write("\t")
            f.write(str(valid_genes[ix]))
            f.write("\n")

    logging.info("Normalization")
    normalizer = cg.Normalizer(True)
    normalizer.fit(ds)
    self.mu = normalizer.mu
    self.sd = normalizer.sd

    logging.info("Feature selection")
    genes = cg.FeatureSelection(2000).fit(ds)

    logging.info("PCA projection")
    self.pca = cg.PCAProjection(genes, max_n_components=50)
    transformed = self.pca.fit_transform(ds, normalizer)

    self.classes = ds.col_attrs["SubclassAssigned"]
    self.le = LabelEncoder().fit(self.classes)
    self.labels = self.le.transform(self.classes)

    train_X, test_X, train_Y, test_Y = train_test_split(transformed, self.labels, test_size=0.2, random_state=0)
    self.classifier = SVC(probability=True)
    self.classifier.fit(train_X, train_Y)
    with open(os.path.join(self.classified_dir, "performance.txt"), "w") as f:
        f.write(classification_report(test_Y, self.classifier.predict(test_X), target_names=self.le.classes_))
def connect(self, project: str, filename: str, mode: str = "r+", timeout: float = 10) -> LoomConnection:
    """
    Try to connect to a local loom file. Returns None if the file is already connected.

    Uses a semaphore to ensure there is never more than one connection open to a loom file
    (as long as this is the only object that connects to the loom file).
    Remember to call `release(project, filename)` after closing the connection!

    Args:
        project (string):   Name of the project (e.g. "Midbrain")
        filename (string):  Filename of the loom file (e.g. "Midbrain_20160701.loom")

    Returns:
        A loom file connection, or None if the file does not exist or was already connected.
    """
    if self.acquire(project, filename, timeout):
        absolute_path = self.list.absolute_file_path(project, filename)
        return LoomConnection(absolute_path, mode)
    return None
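# Hypothetical usage of connect()/release(), following the docstring's contract;
# `manager` stands in for whatever object exposes these methods (an assumption,
# since the surrounding class is not shown here).
def print_shape(manager, project: str = "Midbrain", filename: str = "Midbrain_20160701.loom") -> None:
    ds = manager.connect(project, filename)
    if ds is None:
        return  # file missing, or already connected elsewhere
    try:
        print(ds.shape)
    finally:
        ds.close()
        manager.release(project, filename)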
def fit(self, ds: loompy.LoomConnection, normalizer: Normalizer, cells: np.ndarray = None) -> None:
    if cells is None:
        cells = np.fromiter(range(ds.shape[1]), dtype='int')
    # Support out-of-order datasets
    key = None
    if "Accession" in ds.row_attrs:
        key = "Accession"
    self.pca = IncrementalPCA(n_components=self.n_components)
    layer = self.layer if self.layer is not None else ""
    for (_, selection, view) in ds.scan(items=cells, axis=1, layers=[layer], key=key):
        if len(selection) < self.n_components:
            continue
        vals = normalizer.transform(view.layers[layer][:, :], selection)
        self.pca.partial_fit(vals[self.genes, :].transpose())  # PCA on the selected genes
def tile(self, project: str, filename: str, truncate: bool = False) -> None:
    absolute_path = self.list.absolute_file_path(project, filename)
    ds = None
    if absolute_path != "":
        try:
            lock = self.connections.dataset_locks.get(absolute_path)
            if lock is not None and lock.acquire(blocking=True, timeout=10):
                ds = LoomConnection(absolute_path, 'r')
                tiles = LoomTiles(ds)
                tiles.prepare_heatmap(truncate)
                ds.close()
                lock.release()
        except TimeoutError:
            # May happen when cancelled by the environment (for example, on a server).
            # If so, and the lock was acquired, release the dataset and lock
            if ds is not None:
                ds.close()
                lock.release()
def aggregate(self, ds: loompy.LoomConnection, out_file: str, agg_spec: Dict[str, str] = None) -> None:
    if agg_spec is None:
        agg_spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean"
        }
    cells = ds.col_attrs["Clusters"] >= 0
    labels = ds.col_attrs["Clusters"][cells]
    n_labels = len(set(labels))

    logging.info("Aggregating clusters by mean")
    cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
    with loompy.connect(out_file) as dsout:
        logging.info("Trinarizing")
        if type(self.f) is list or type(self.f) is tuple:
            for ix, f in enumerate(self.f):
                trinaries = cg.Trinarizer(f=f).fit(ds)
                if ix == 0:
                    dsout.layers["trinaries"] = trinaries
                else:
                    dsout.layers[f"trinaries_{f}"] = trinaries
        else:
            trinaries = cg.Trinarizer(f=self.f).fit(ds)
            dsout.layers["trinaries"] = trinaries

        logging.info("Computing cluster gene enrichment scores")
        (markers, enrichment, qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
        dsout.layers["enrichment"] = enrichment
        dsout.layers["enrichment_q"] = qvals
        dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

        # Renumber the clusters
        logging.info("Renumbering clusters by similarity, and permuting columns")
        if "_Selected" in ds.ra:
            genes = (ds.ra._Selected == 1)
        else:
            logging.info("Normalization")
            normalizer = cg.Normalizer(False)
            normalizer.fit(ds)
            logging.info("Selecting up to 1000 genes")
            genes = cg.FeatureSelection(1000).fit(ds, mu=normalizer.mu, sd=normalizer.sd)
        data = np.log(dsout[:, :] + 1)[genes, :].T
        D = pdist(data, 'euclidean')
        Z = hc.linkage(D, 'ward')
        optimal_Z = optimal_leaf_ordering(Z, D)
        ordering = hc.leaves_list(optimal_Z)

        # Permute the aggregated file, and renumber
        dsout.permute(ordering, axis=1)
        dsout.ca.Clusters = np.arange(n_labels)

        # Renumber the original file, and permute
        d = dict(zip(ordering, np.arange(n_labels)))
        new_clusters = np.array([d[x] if x in d else -1 for x in ds.ca.Clusters])
        ds.ca.Clusters = new_clusters
        ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

        # Reorder the genes, markers first, ordered by enrichment in clusters
        logging.info("Permuting rows")
        mask = np.zeros(ds.shape[0], dtype=bool)
        mask[markers] = True
        # Fetch enrichment from the aggregated file, so we get it already permuted on the column axis
        gene_order = np.zeros(ds.shape[0], dtype='int')
        gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :], axis=1)
        gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :], axis=1) + dsout.shape[1]
        gene_order = np.argsort(gene_order)
        ds.permute(gene_order, axis=0)
        dsout.permute(gene_order, axis=0)

        data = trinaries[:, ordering][gene_order, :][:self.n_markers * n_labels, :].T
        cluster_scores = []
        for ix in range(n_labels):
            # Sum the trinarization scores of this cluster's own marker block
            # (note: the block width of 10 assumes n_markers == 10)
            cluster_scores.append(data[ix, ix * 10:(ix + 1) * 10].sum())
        dsout.ca.ClusterScore = np.array(cluster_scores)
def plot_graph(ds: loompy.LoomConnection, out_file: str, tags: List[str] = None) -> None:
    logging.info("Loading graph")
    n_cells = ds.shape[1]
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]
    has_edges = False
    if "MKNN" in ds.list_edges(axis=1):
        (a, b, w) = ds.get_edges("MKNN", axis=1)
        has_edges = True
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()
    labels = ds.col_attrs["Clusters"]
    if "Outliers" in ds.col_attrs:
        outliers = ds.col_attrs["Outliers"]
    else:
        outliers = np.zeros(ds.shape[1])

    # Compute a good size for the markers, based on local density
    logging.info("Computing node size")
    min_pts = 50
    eps_pct = 60
    nn = NearestNeighbors(n_neighbors=min_pts, algorithm="ball_tree", n_jobs=4)
    nn.fit(pos)
    knn = nn.kneighbors_graph(mode='distance')
    k_radius = knn.max(axis=1).toarray()
    epsilon = 24 * np.percentile(k_radius, eps_pct)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)

    # Draw edges
    if has_edges:
        logging.info("Drawing edges")
        lc = LineCollection(zip(pos[a], pos[b]), linewidths=0.25, zorder=0, color='grey', alpha=0.1)
        ax.add_collection(lc)

    # Draw nodes
    logging.info("Drawing nodes")
    colors20 = np.vstack((plt.cm.Vega20b(np.linspace(0., 1, 20))[::2], plt.cm.Vega20c(np.linspace(0, 1, 20))[1::2]))
    plots = []
    names = []
    for i in range(max(labels) + 1):
        cluster = labels == i
        n_cells = cluster.sum()
        if np.all(outliers[labels == i] == 1):
            edgecolor = colorConverter.to_rgba('red', alpha=.1)
            plots.append(plt.scatter(x=pos[outliers == 1, 0], y=pos[outliers == 1, 1], c='grey', marker='.', edgecolors=edgecolor, alpha=0.1, s=epsilon))
            names.append(f"{i}/n={n_cells} (outliers)")
        else:
            plots.append(plt.scatter(x=pos[cluster, 0], y=pos[cluster, 1], c=cg.colors75[np.mod(i, 75)], marker='.', lw=0, s=epsilon, alpha=0.75))
            txt = str(i)
            if "ClusterName" in ds.ca.keys():
                txt = ds.ca.ClusterName[ds.ca.Clusters == i][0]
            if tags is not None:
                names.append(f"{txt}/n={n_cells} " + tags[i].replace("\n", " "))
            else:
                names.append(f"{txt}/n={n_cells}")
    logging.info("Drawing legend")
    plt.legend(plots, names, scatterpoints=1, markerscale=2, loc='upper left', bbox_to_anchor=(1, 1), fancybox=True, framealpha=0.5, fontsize=10)

    logging.info("Drawing cluster IDs")
    for lbl in range(0, max(labels) + 1):
        txt = str(lbl)
        if "ClusterName" in ds.ca.keys():
            txt = ds.ca.ClusterName[ds.ca.Clusters == lbl][0]
        if np.all(outliers[labels == lbl] == 1):
            continue
        if np.sum(labels == lbl) == 0:
            continue
        (x, y) = np.median(pos[np.where(labels == lbl)[0]], axis=0)
        ax.text(x, y, txt, fontsize=12, bbox=dict(facecolor='white', alpha=0.5, ec='none'))
    logging.info("Saving to file")
    fig.savefig(out_file, format="png", dpi=144, bbox_inches='tight')
    plt.close()
def plot_classification(ds: loompy.LoomConnection, out_file: str) -> None:
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]
    labels = ds.col_attrs["Clusters"][valid]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    classes = ["Neurons", "Astrocyte", "Ependymal", "OEC", "Oligos", "Schwann", "Cycling", "Vascular", "Immune"]
    colors = [plt.cm.get_cmap('Vega20')((ix + 0.5) / 20) for ix in range(20)]

    combined_colors = np.zeros((ds.shape[1], 4)) + np.array((0.5, 0.5, 0.5, 0))
    for ix, cls in enumerate(classes):
        cmap = LinearSegmentedColormap.from_list('custom cmap', [(1, 1, 1, 0), colors[ix]])
        cells = ds.col_attrs["Class0"] == classes[ix]
        if np.sum(cells) > 0:
            combined_colors[cells] = [cmap(x) for x in ds.col_attrs["Class_" + classes[ix]][cells]]

    # Erythrocytes and excluded cells get fixed colors (pink and grey, respectively)
    cells = ds.col_attrs["Class0"] == "Erythrocyte"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([1, 0.71, 0.76, 0])
    cells = ds.col_attrs["Class0"] == "Excluded"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([0.5, 0.5, 0.5, 0])

    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Class")
    nx.draw_networkx_edges(g, pos=pos, alpha=0.2, width=0.1, edge_color='gray')
    nx.draw_networkx_nodes(g, pos=pos, node_color=combined_colors[valid], node_size=10, alpha=0.6, linewidths=0)
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
def plot_graph_age(ds: loompy.LoomConnection, out_file: str, tags: List[str]) -> None:
    def parse_age(age: str) -> float:
        if age == "":
            return 0
        unit, amount = age[0], float(age[1:])
        if unit == "P":
            amount += 19.
        return amount

    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    sfdp = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]
    # The sorting below makes every circle visible and avoids overlaps in crowded regions
    orderx = np.argsort(sfdp[:, 0], kind="mergesort")
    ordery = np.argsort(sfdp[:, 1], kind="mergesort")
    orderfin = orderx[ordery]
    # draw_networkx_edges wants the positions in the order of the graph `g`;
    # keeping a copy is a shortcut that avoids re-sorting the graph
    sfdp_original = sfdp.copy()
    sfdp = sfdp[orderfin, :]
    labels = ds.col_attrs["Clusters"][valid][orderfin]
    age = np.fromiter(map(parse_age, ds.col_attrs["Age"]), dtype=float)[valid][orderfin]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    ax = fig.add_subplot(111)

    # Draw the KNN graph first, with gray transparent edges
    nx.draw_networkx_edges(g, pos=sfdp_original, alpha=0.1, width=0.1, edge_color='gray')

    # Then draw the nodes, colored by age
    block_colors = plt.cm.nipy_spectral_r((age - 6) / 14.)
    nx.draw_networkx_nodes(g, pos=sfdp, node_color=block_colors, node_size=10, alpha=0.4, linewidths=0)

    for lbl in range(0, max(labels) + 1):
        if np.sum(labels == lbl) == 0:
            continue
        (x, y) = np.median(sfdp[np.where(labels == lbl)[0]], axis=0)
        text = "#" + str(lbl)
        if len(tags[lbl]) > 0:
            text += "\n" + tags[lbl]
        ax.text(x, y, text, fontsize=8, bbox=dict(facecolor='gray', alpha=0.3, ec='none'))
    ax.axis('off')
    levels = np.unique(age)
    for il, lev in enumerate(levels):
        ax.add_patch(plt.Rectangle((0.90, 0.7 + il * 0.016), 0.014, 0.014, color=plt.cm.nipy_spectral_r((lev - 6) / 14.), clip_on=0, transform=ax.transAxes))
        ax.text(0.93, 0.703 + il * 0.016, ("E%.1f" % lev if lev < 18.5 else "P%.1f" % (lev - 19)), transform=ax.transAxes)
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
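# Examples of the parse_age convention defined above (embryonic days pass through,
# postnatal days are offset by 19 embryonic days):
#   parse_age("E13.5") -> 13.5
#   parse_age("P5")    -> 24.0
#   parse_age("")      -> 0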
def fit(self, ds: loompy.LoomConnection) -> None:
    logging.info("Computing pseudoage")
    ages = np.array([age_to_num(x) for x in ds.ca.Age])
    knn = ds.col_graphs.KNN
    k = knn.nnz / knn.shape[0]
    ds.ca.PseudoAge = (knn.astype("bool") @ ages) / k

    logging.info("Slicing pseudoage")
    slice_names: List[str] = []
    with TemporaryDirectory() as tempfolder:
        slices = np.percentile(ds.ca.PseudoAge, np.arange(0, 101, 5))
        logging.info("Collecting cells")
        for (ix, _, view) in ds.scan(axis=1):
            for i in range(len(slices) - 2):
                s1 = slices[i]
                s2 = slices[i + 2]
                slice_name = f"Age{s1:05.2f}to{s2:05.2f}".replace(".", "") + ".loom"
                if slice_name not in slice_names:
                    slice_names.append(slice_name)
                cells = (view.ca.PseudoAge >= s1) & (view.ca.PseudoAge < s2)
                if cells.sum() == 0:
                    continue
                fname = os.path.join(tempfolder, slice_name)
                if not os.path.exists(fname):
                    with loompy.new(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)
                else:
                    with loompy.connect(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)
        for slice_name in slice_names:
            fname = os.path.join(tempfolder, slice_name)
            logging.info("Cytograph on " + slice_name)
            with loompy.connect(fname) as ds_slice:  # distinct name, to avoid clobbering the outer `ds`
                Cytograph(config=load_config()).fit(ds_slice)

        # Use dynamic programming to find the deepest tree (forest), as given by total number of cells along each branch
        logging.info("Computing pseudolineage")
        clusters = "Clusters"
        min_pct = 0.1

        # List of matrices giving the bipartite graph between each pair of layers, weighted by number of shared cells
        overlaps = []
        n_nodes = []  # List of number of nodes (clusters) in each layer
        n_cells = []  # List of arrays giving the number of cells in each cluster
        n_layers = len(slice_names)

        # Compute the bipartite graphs between layers
        for t in range(n_layers):
            # Link clusters from layer t to clusters from layer t + 1
            logging.info(f"{slice_names[t]}.loom")
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds1:
                n_nodes.append(ds1.ca[clusters].max() + 1)
                n_cells.append(np.zeros(n_nodes[t]))
                for c in range(n_nodes[t]):
                    n_cells[t][c] = (ds1.ca[clusters] == c).sum()
                if t >= n_layers - 1:
                    break
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds2:
                    overlap = np.zeros((np.unique(ds1.ca[clusters]).shape[0], np.unique(ds2.ca[clusters]).shape[0]), dtype="int")
                    for i in np.unique(ds1.ca[clusters]):
                        cells1 = ds1.ca.CellID[ds1.ca[clusters] == i]
                        for j in np.unique(ds2.ca[clusters]):
                            cells2 = ds2.ca.CellID[ds2.ca[clusters] == j]
                            overlap[i, j] = np.intersect1d(cells1, cells2).shape[0]
                    overlaps.append(overlap)

        # List of arrays keeping track of the depth of the deepest tree starting at each node in the layer
        # Depth defined as sum of the number of shared cells along the branch
        depths = [np.zeros(n, dtype="int") for n in n_nodes]
        # List of arrays giving the predecessor of each cluster (or -1 if no predecessor)
        edges = [np.zeros(n, dtype="int") for n in n_nodes[1:]]
        for t in range(0, n_layers - 1):
            for i in range(n_nodes[t + 1]):
                # Find the widest, deepest branch from any node j in layer t to node i in layer t + 1
                # Widest, deepest meaning: greatest sum of depth up to node j in layer t plus number of shared cells
                # But disallowing any branch with less than min_pct % shared cells
                best_j = -1
                best_depth = 0
                for j in range(n_nodes[t]):
                    pct_overlapping = 100 * overlaps[t][j, i] / (n_cells[t][j] + n_cells[t + 1][i])
                    if pct_overlapping > min_pct:
                        depth = depths[t][j] + overlaps[t][j, i]
                        if depth > best_depth:
                            best_depth = depth
                            best_j = j
                depths[t + 1][i] = best_depth  # accumulate depth along the branch
                edges[t][i] = best_j

        # Now we have:
        #   edges:    List of arrays giving the index of the predecessor of each cluster (or -1 if no predecessor exists)
        #   overlaps: List of matrices giving the number of cells shared between clusters in layer t and t + 1
        #   n_nodes:  List of number of nodes (clusters) in each layer
        #   n_cells:  List of arrays of number of cells in each node (cluster)

        # Now position the nodes of each layer such that no edges cross
        ypositions = [np.arange(n_nodes[0])]
        for t in range(len(edges)):
            pos = np.full(n_nodes[t + 1], -1)
            for i in range(pos.shape[0]):
                prev = edges[t][i]
                if prev >= 0:
                    pos[i] = ypositions[t][prev]
            ordering = np.argsort(pos)
            mapping = dict(zip(ordering, range(len(ordering))))
            ypositions.append(np.array([mapping[i] for i in range(len(ordering))]))
        # Make the positions proportional to the number of cells (cumulative)
        max_pos = 0
        for i, pos in enumerate(ypositions):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                n_clusters = ds0.ca[clusters].max() + 1
                ncells = np.array([(ds0.ca[clusters] == c).sum() for c in range(n_clusters)])
                total = 0
                new_pos = np.zeros_like(pos)
                for j in range(len(pos)):
                    cluster = np.where(pos == j)[0]
                    new_pos[cluster] = total + ncells[cluster] / 2
                    total += ncells[cluster]
                ypositions[i] = new_pos / 1000
                max_pos = max(max_pos, max(ypositions[i]))
        for i, pos in enumerate(ypositions):
            ypositions[i] += (max_pos - np.max(pos)) / 2

        # Then position the layers properly in time
        xpositions = []
        for i in range(n_layers):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                xpositions.append(np.mean(ds0.ca.PseudoAge))

        # Now project each individual cell to the pseudolineage
        logging.info("Projecting cells to pseudolineage")
        cell_to_xy = {}
        for t in range(len(n_nodes) - 1):
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds0:
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds1:
                    for i in range(n_nodes[t + 1]):
                        if edges[t][i] != -1:
                            y1 = ypositions[t][edges[t][i]]
                            y2 = ypositions[t + 1][i]
                            offset = (xpositions[t + 1] - xpositions[t]) / 4
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge < slices[t + 2])
                            crs = np.array(CatmullRomSpline(n_points=100).fit_transform(np.array([
                                [slices[t + 1] - offset, y1],
                                [slices[t + 1], y1],
                                [slices[t + 2], y2],
                                [slices[t + 2] + offset, y2]])))
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            f = interp1d(crs[:, 0], crs[:, 1], fill_value="extrapolate")
                            fw = interp1d(crs[:, 0], widths, fill_value="extrapolate")
                            y = f(ds1.ca.PseudoAge[overlapping_cells]) + np.random.normal(scale=fw(ds1.ca.PseudoAge[overlapping_cells]) / 6, size=overlapping_cells.sum())
                            for m, cix in enumerate(np.where(overlapping_cells)[0]):  # m, cix avoid shadowing the cluster loop variable i
                                cell_to_xy[ds1.ca.CellID[cix]] = [ds1.ca.PseudoAge[cix], y[m]]
                    # Draw the leftmost pseudoage slice
                    if t == 0:
                        for i in range(n_nodes[0]):
                            y1 = ypositions[0][i]
                            widths = np.linspace(n_cells[t][i], n_cells[t][i], num=100) / 1500
                            overlapping_cells = (ds0.ca[clusters] == i) & (ds0.ca.PseudoAge < slices[1])
                            y = y1 + np.random.normal(scale=widths[0] / 6, size=overlapping_cells.sum())
                            for m, cix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds0.ca.CellID[cix]] = [ds0.ca.PseudoAge[cix], y[m]]
                    # Draw the rightmost pseudoage slice
                    if t == len(n_nodes) - 2:
                        for i in range(n_nodes[-1]):
                            y2 = ypositions[t + 1][i]
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge > slices[-2])
                            y = y2 + np.random.normal(scale=widths[-1] / 6, size=overlapping_cells.sum())
                            for m, cix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds1.ca.CellID[cix]] = [ds1.ca.PseudoAge[cix], y[m]]

        logging.info("Saving pseudolineage projection back in original file")
        xy = np.zeros((ds.shape[1], 2))
        for i, cellid in enumerate(cell_to_xy.keys()):
            j = np.where(ds.ca.CellID == cellid)[0]
            xy[j] = cell_to_xy[cellid]
        ds.ca.PseudoLineage = xy
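# Toy sketch of the pseudoage smoothing at the top of fit(): each cell's age is
# replaced by the average age of its KNN neighborhood, computed through a boolean
# adjacency matrix (the graph and ages below are made up).
import numpy as np
from scipy import sparse

ages = np.array([10.0, 11.0, 12.0, 13.0])
knn = sparse.csr_matrix(np.array([
    [1, 1, 0, 0],
    [1, 1, 1, 0],
    [0, 1, 1, 1],
    [0, 0, 1, 1]]))
k = knn.nnz / knn.shape[0]  # average number of neighbors
pseudo_age = (knn.astype(bool) @ ages) / k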
def fit(self, ds: loompy.LoomConnection, plot_file: str = None, report_file: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    Args:
        ds             Dataset
        plot_file      Filename for optional plot
        report_file    Filename for optional report

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)
    """
    if "ClusterName" in ds.ca:
        cluster_names = [str(ds.ca.ClusterName[ds.ca.Clusters == lbl][0]) for lbl in np.unique(ds.ca.Clusters)]
    else:
        cluster_names = [str(lbl) for lbl in np.unique(ds.ca.Clusters)]

    genes = np.where(ds.ra.Selected == 1)[0]
    data = ds.sparse(rows=genes).T
    hpf = HPF(k=ds.ca.HPF.shape[1], validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False)
    hpf.fit(data)
    theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    train_X, test_X, train_Y, test_Y = train_test_split(theta, ds.ca.Clusters, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(test_Y, classifier.predict(test_X), labels=np.unique(ds.ca.Clusters), target_names=cluster_names)
    self.proba = classifier.predict_proba(theta)

    if plot_file is not None:
        plt.figure()
        agg = npg.aggregate(ds.ca.Clusters, self.proba, axis=0, func="mean")
        plt.imshow(agg, cmap="viridis")
        plt.xticks(np.arange(len(cluster_names)), cluster_names, rotation="vertical", fontsize=7)
        plt.yticks(np.arange(len(cluster_names)), cluster_names, rotation="horizontal", fontsize=7)
        plt.xlabel("Predicted cluster")
        plt.ylabel("Ground truth cluster")
        plt.title("Cluster quality (predictive power)")
        cbar = plt.colorbar()
        cbar.set_label('Probability of predicted cluster', rotation=90)
        plt.savefig(plot_file, bbox_inches="tight")
        plt.close()
    if report_file is not None:
        with open(report_file, "w") as f:
            f.write(self.report)
    return self.proba
def fit(self, ds: loompy.LoomConnection) -> None:
    logging.info(f"Normalizing and selecting {self.n_genes} genes")
    normalizer = Normalizer(False)
    normalizer.fit(ds)
    genes = FeatureSelectionByVariance(self.n_genes, mask=self.mask).fit(ds)
    self.genes = genes

    if self.factorization == 'PCA' or self.factorization == 'both' or self.batch_keys is not None:
        factorization = "PCA"
    else:
        factorization = "HPF"

    if factorization == "PCA":
        n_components = min(50, ds.shape[1])
        logging.info("PCA projection to %d components", n_components)
        pca = PCA(genes, max_n_components=self.n_factors, test_significance=False, batch_keys=self.batch_keys)
        transformed = pca.fit_transform(ds, normalizer)
    else:
        data = ds.sparse(rows=genes).T

        # Subsample to the lowest number of UMIs
        if "TotalUMI" in ds.ca:
            totals = ds.ca.TotalUMI
        else:
            totals = ds.map([np.sum], axis=1)[0]
        min_umis = int(np.min(totals))
        logging.debug(f"Subsampling to {min_umis} UMIs")
        temp = data.toarray()
        for c in range(temp.shape[0]):
            temp[c, :] = np.random.binomial(temp[c, :].astype('int32'), min_umis / totals[c])
        data = sparse.coo_matrix(temp)

        # HPF factorization
        hpf = HPF(k=self.n_factors, validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False, n_threads=self.n_threads)
        hpf.fit(data)
        transformed = (hpf.theta.T / hpf.theta.sum(axis=1)).T  # Normalize so the sums are one, because JSD requires it

    # KNN in latent space
    logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=NumbaPerformanceWarning)  # Suppress warnings about numba not being able to parallelize code
        warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)  # Suppress warnings about future deprecations
        warnings.simplefilter("ignore", category=SparseEfficiencyWarning)  # Suppress warnings about setting the diagonal to 1
        nn = NNDescent(data=transformed, metric=(jensen_shannon_distance if factorization == "HPF" else "euclidean"))
        indices, distances = nn.query(transformed, k=self.k_pooling)
        # Note: we convert distances to similarities here, to support Poisson smoothing below
        knn = sparse.csr_matrix(
            (np.ravel(distances), np.ravel(indices), np.arange(0, distances.shape[0] * distances.shape[1] + 1, distances.shape[1])),
            (transformed.shape[0], transformed.shape[0]))
        max_d = knn.data.max()
        knn.data = (max_d - knn.data) / max_d
        knn.setdiag(1)  # This causes a sparse efficiency warning, but it's not a slow step relative to everything else
        self.knn = knn
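# Sketch of the distance-to-similarity conversion used above, on a toy CSR KNN
# matrix (three cells, three neighbors each): the nearest neighbor gets weight
# close to 1 and the farthest gets weight 0.
import numpy as np
from scipy import sparse

distances = np.array([[0.0, 0.2, 0.5],
                      [0.0, 0.1, 0.4],
                      [0.0, 0.3, 0.6]])
indices = np.array([[0, 2, 1],
                    [1, 0, 2],
                    [2, 1, 0]])
indptr = np.arange(0, distances.size + 1, distances.shape[1])
knn = sparse.csr_matrix((distances.ravel(), indices.ravel(), indptr), shape=(3, 3))
max_d = knn.data.max()
knn.data = (max_d - knn.data) / max_d  # similarities in [0, 1]
knn.setdiag(1)  # each cell is fully similar to itself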
def aggregate_loom(ds: loompy.LoomConnection, out_file: str, select: np.ndarray, group_by: str, aggr_by: str, aggr_ca_by: Dict[str, str], return_matrix: bool = False) -> np.ndarray:
    """
    Aggregate a Loom file by applying aggregation functions to the main matrix as well as to the column attributes

    Args:
        ds          The Loom file
        out_file    The name of the output Loom file (will be appended to if it exists)
        select      Bool array giving the columns to include (or None, to include all)
        group_by    The column attribute to group by
        aggr_by     The aggregation function for the main matrix
        aggr_ca_by  The aggregation functions for the column attributes (or None to skip)

    Remarks:
        aggr_by gives the aggregation function for the main matrix
        aggr_ca_by is a dictionary with column attributes as keys and aggregation functions as values

        Aggregation functions can be any valid aggregation function from here: https://github.com/ml31415/numpy-groupies

        In addition, you can specify:
            "tally" to count the number of occurrences of each value of a categorical attribute
    """
    ca = {}  # type: Dict[str, np.ndarray]
    if select is not None:
        raise ValueError("The 'select' argument is deprecated")
    labels = (ds.ca[group_by]).astype('int')
    _, zero_strt_sort_noholes_lbls = np.unique(labels, return_inverse=True)
    n_groups = len(set(labels))
    if aggr_ca_by is not None:
        for key in ds.col_attrs.keys():
            if key not in aggr_ca_by:
                continue
            func = aggr_ca_by[key]
            if func == "tally":
                for val in set(ds.col_attrs[key]):
                    ca[key + "_" + val] = npg.aggregate(zero_strt_sort_noholes_lbls, (ds.col_attrs[key] == val).astype('int'), func="sum", fill_value=0)
            elif func == "mode":
                def mode(x):
                    return scipy.stats.mode(x)[0][0]
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=mode, fill_value=0).astype('str')
            elif func == "mean":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=func, fill_value=0)
            elif func == "first":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls, ds.col_attrs[key], func=func, fill_value=ds.col_attrs[key][0])

    m = np.empty((ds.shape[0], n_groups))
    for (_, selection, view) in ds.scan(axis=0):
        vals_aggr = npg.aggregate(zero_strt_sort_noholes_lbls, view[:, :], func=aggr_by, axis=1, fill_value=0)
        m[selection, :] = vals_aggr

    if return_matrix:
        return m
    loompy.create_append(out_file, m, ds.ra, ca, fill_values="auto")
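# Hypothetical usage of aggregate_loom(): aggregate a file by cluster, taking the
# mean of the main matrix, tallying Age, and keeping the first TissuePool value
# (the filenames and attribute names here are assumptions).
import loompy

with loompy.connect("mydata.loom") as ds:
    aggregate_loom(ds, "mydata.agg.loom", None, "Clusters", "mean",
                   {"Age": "tally", "TissuePool": "first"})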
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Args:
        n_genes     Number of genes to use for manifold learning (ignored if genes is not None)
        gtsne       Use graph t-SNE for layout (default: standard t-SNE)
        alpha       The scale parameter for multiscale KNN
        genes       List of genes to use for manifold learning

    Returns:
        knn     The multiscale knn graph as a sparse matrix, with k = 100
        mknn    Mutual knn subgraph, with k = 20
        pos     2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
    n_total = ds.shape[1]
    logging.info("%d of %d cells were valid", n_valid, n_total)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    if self.filter_cellcycle is not None:
        cell_cycle_genes = np.array(open(self.filter_cellcycle).read().split())
        mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
        if np.sum(mask) == 0:
            logging.warning("No cell cycle genes were filtered; check your gene list")
    else:
        mask = None

    if self.genes is None:
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        n_components = min(50, n_valid)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
        transformed = pca.fit_transform(ds, normalizer, cells=cells)

        logging.info("Generating KNN graph")
        k = min(10, n_valid - 1)
        nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
        nn.fit(transformed)
        knn = nn.kneighbors_graph(mode='connectivity').tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("Louvain-Jaccard clustering")
        lj = cg.LouvainJaccard(resolution=1)
        labels = lj.fit_predict(knn)
        # Make labels for excluded cells == -1
        labels_all = np.zeros(ds.shape[1], dtype='int') + -1
        labels_all[cells] = labels
        ds.set_attr("Clusters", labels_all, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " LJ clusters")

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
    else:
        genes = self.genes
        temp = np.zeros(ds.shape[0])
        temp[genes] = 1
        ds.set_attr("_Selected", temp, axis=0)
        logging.info("%d genes selected", temp.sum())

        n_components = min(50, n_valid)
        logging.info("PCA projection to %d components", n_components)
        pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
        transformed = pca.fit_transform(ds, normalizer, cells=cells)

        logging.info("Generating KNN graph")
        k = min(10, n_valid - 1)
        nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
        nn.fit(transformed)
        knn = nn.kneighbors_graph(mode='connectivity').tocoo()
        mknn = knn.minimum(knn.transpose()).tocoo()

        logging.info("Louvain-Jaccard clustering")
        lj = cg.LouvainJaccard(resolution=1)
        labels = lj.fit_predict(knn)
        # Make labels for excluded cells == -1
        labels_all = np.zeros(ds.shape[1], dtype='int') + -1
        labels_all[cells] = labels
        ds.set_attr("Clusters", labels_all, axis=1)
        n_labels = np.max(labels) + 1
        logging.info("Found " + str(n_labels) + " LJ clusters")

        logging.info("Marker selection")
        (genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels)).fit(ds)

    # Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
    cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
    n_components = min(50, cells_adjusted.shape[0])
    logging.info("PCA projection to %d components", n_components)
    pca = cg.PCAProjection(genes, max_n_components=n_components)
    pca.fit(ds, normalizer, cells=cells_adjusted)
    # Note that here we're transforming all cells; we just did the fit on the selection
    transformed = pca.transform(ds, normalizer, cells=cells)

    k = min(100, n_valid - 1)
    logging.info("Generating multiscale KNN graph (k = %d)", k)
    nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
    nn.fit(transformed)
    knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
    n_cells = knn.shape[0]
    a = np.tile(np.arange(n_cells), k)
    b = np.reshape(knn.T, (n_cells * k,))
    w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
    knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
    threshold = w > 0.05
    mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
    mknn = mknn.minimum(mknn.transpose()).tocoo()

    perplexity = min(k, (n_valid - 1) / 3 - 1)
    if self.gtsne:
        logging.info("gt-SNE layout")
        # Note that the perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
    else:
        logging.info("t-SNE layout")
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
    tsne_all = np.zeros((ds.shape[1], 2), dtype='int') + np.min(tsne_pos, axis=0)
    tsne_all[cells] = tsne_pos

    # Transform back to the full set of cells
    knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
    mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))
    return (knn, mknn, tsne_all)
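# Sketch of the multiscale edge weighting used above: neighbor ranks 1..k are
# converted to weights 1/rank**alpha, so near neighbors dominate and edges below
# 0.05 are dropped from the mutual graph (alpha = 1 here is an assumption).
import numpy as np

k, alpha = 100, 1.0
ranks = np.arange(1, k + 1)
weights = 1 / np.power(ranks, alpha)  # 1.0, 0.5, 0.333, ... down to 0.01
kept = weights > 0.05                 # only the ~20 nearest neighbors survive thresholding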
def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
    """
    Discover the manifold

    Returns:
        knn     The knn graph as a sparse matrix
        mknn    Mutual knn subgraph
        pos     2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
    """
    n_cells = ds.shape[1]
    logging.info("Processing all %d cells", n_cells)
    logging.info("Validating genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes
    logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1), ds.shape[0])

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Selecting up to %d genes", self.n_genes)
    genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

    logging.info("Loading data for selected genes")
    data = np.zeros((n_cells, genes.shape[0]))
    for (ix, selection, view) in ds.scan(axis=1):
        data[selection - ix, :] = view[genes, :].T

    logging.info("Computing initial subspace KNN")
    subspaces = np.ones(data.shape)
    knn = subspace_knn_graph(data, subspaces)
    mknn = knn.minimum(knn.transpose()).tocoo()

    for t in range(5):
        logging.info(f"Refining subspace KNN (iteration {t + 1})")
        logging.info("Louvain clustering")
        graph = nx.from_scipy_sparse_matrix(mknn)
        partitions = community.best_partition(graph)
        labels = np.array([partitions[key] for key in range(mknn.shape[0])])
        ds.ca.Clusters = labels
        n_labels = np.max(labels) + 1
        logging.info(f"Found {n_labels} clusters")
        logging.info("Marker selection")
        (_, enrichment, _) = cg.MarkerSelection(n_markers=10, findq=False).fit(ds)
        subspaces = np.zeros(data.shape)
        for ix in range(enrichment.shape[1]):
            for j in range(n_cells):
                subspaces[j, np.argsort(-enrichment[:, ix])[:self.n_genes // n_labels]] = 1
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

    perplexity = min(self.k, (n_cells - 1) / 3 - 1)
    logging.info("gt-SNE layout")
    # Note that the perplexity argument is ignored in this case, but must still be given
    # because bhtsne will check that it has a valid value
    tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())
    return (knn, mknn, tsne_pos)
def _fit(self, ds: loompy.LoomConnection) -> Tuple[np.ndarray, np.ndarray]:
    """
    Finds n_markers genes per cluster using enrichment score

    Args:
        ds (LoomConnection):    Dataset

    Returns:
        ndarray of selected genes (list of ints)
        ndarray of enrichment scores
    """
    labels = ds.ca[self.labels_attr]
    n_labels = max(labels) + 1
    n_cells = ds.shape[1]

    # Number of cells per cluster
    sizes = np.bincount(labels, minlength=n_labels)
    # Number of nonzero values per cluster
    nnz = cg.aggregate_loom(ds, None, None, self.labels_attr, np.count_nonzero, None, return_matrix=True)
    # Mean value per cluster
    means = cg.aggregate_loom(ds, None, None, self.labels_attr, "mean", None, return_matrix=True)
    # Non-zeros and means over all cells
    (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)
    # Scale by number of cells
    f_nnz = nnz / sizes
    f_nnz_overall = nnz_overall / n_cells

    # Means and fraction non-zero values in other clusters (per cluster)
    means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
    f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

    # Enrichment of each gene in each cluster, relative to all other clusters
    enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

    # Select best markers
    if "_Valid" not in ds.ra:
        logging.info("Recomputing the list of valid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
        ds.ra._Valid = valid_genes.astype('int')

    included: List[int] = []
    if self.mask is None:
        excluded = set(np.where(ds.row_attrs["_Valid"] == 0)[0])
    else:
        excluded = set(np.where(np.logical_and(ds.row_attrs["_Valid"] == 0, self.mask))[0])
    for ix in range(max(labels) + 1):
        enriched = np.argsort(enrichment[:, ix])[::-1]
        n = 0
        count = 0
        while count < self.n_markers:
            if enriched[n] in excluded:
                n += 1
                continue
            included.append(enriched[n])
            excluded.add(enriched[n])
            n += 1
            count += 1
    return (np.array(included), enrichment)