def _fit(self, ds: "loompy.LoomConnection", labels: np.ndarray) -> tuple:
    """
    Compute a per-cluster enrichment score for every gene and select markers.

    For each gene and cluster, the score multiplies two ratios: the fraction
    of nonzero values inside the cluster versus in all other cells, and the
    mean inside the cluster versus in all other cells. Small pseudocounts
    (0.1 and 0.01) keep both ratios finite when a gene is absent.

    Args:
        ds: Dataset providing ``shape``, ``aggregate()`` and ``map()``; its
            shape is unpacked as (n_genes, n_cells).
        labels: Integer cluster label per cell. It is passed to
            ``np.bincount``, so values must be non-negative integers.
            Presumably labels are contiguous (0..n_labels-1) and there is one
            per column of ``ds`` -- TODO confirm against callers.

    Returns:
        A tuple ``(included, enrichment, means)``:
          * ``included``: bool vector of length n_genes, True for selected
            marker genes.
          * ``enrichment``: enrichment scores, indexed [gene, cluster].
          * ``means``: per-cluster mean returned by ``ds.aggregate``.

    Remarks:
        If ``self.valid_genes`` is None it is computed here (genes with more
        than 10 nonzero values that are nonzero in fewer than 60% of cells)
        and stored on ``self`` as a side effect.
    """
    logging.info("Computing enrichment statistic")
    n_labels = len(np.unique(labels))
    n_genes, n_cells = ds.shape

    # Number of cells per cluster
    sizes = np.bincount(labels, minlength=n_labels)
    # Number of nonzero values per cluster
    nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
    # Mean value per cluster
    means = ds.aggregate(None, None, labels, "mean", None)
    # Non-zeros and means over all cells
    (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)

    # Scale by number of cells
    f_nnz = nnz / sizes
    f_nnz_overall = nnz_overall / n_cells

    # Means and fraction non-zero values in other clusters (per cluster).
    # NOTE: n_other is zero for a cluster that contains every cell, in which
    # case these divisions produce inf/nan.
    n_other = n_cells - sizes
    means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / n_other
    f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / n_other

    enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

    # Select best markers
    if self.valid_genes is None:
        logging.info("Identifying valid genes")
        # Reuse the overall nonzero counts instead of a second pass over ds
        self.valid_genes = np.logical_and(nnz_overall > 10, nnz_overall < n_cells * 0.6)

    if self.mask is None:
        excluded = set(np.where(~self.valid_genes)[0])
    else:
        excluded = set(np.where(((~self.valid_genes) | self.mask))[0])

    included = np.zeros(n_genes, dtype=bool)
    for ix in range(n_labels):
        # Genes ranked from most to least enriched in this cluster
        ranked = np.argsort(enrichment[:, ix])[::-1]
        count = 0
        for gene in ranked:
            if count >= self.n_markers_per_cluster:
                break
            if gene in excluded:
                continue
            included[gene] = True
            # A gene may serve as marker for one cluster only
            excluded.add(gene)
            count += 1
        if count < self.n_markers_per_cluster:
            # Ran out of eligible genes; keep what was found rather than
            # indexing past the end of the ranking.
            logging.warning(
                "Cluster %d: only %d of %d requested markers could be selected",
                ix, count, self.n_markers_per_cluster
            )
    return (included, enrichment, means)
def aggregate(self, ds: loompy.LoomConnection, *, out_file: str, agg_spec: Dict[str, str] = None) -> None:
    """
    Aggregate ``ds`` by cluster into ``out_file``, then reorder both files.

    Steps, in order:
      1. Aggregate ``ds`` by its "Clusters" column attribute (mean) into
         ``out_file`` using ``agg_spec`` for the column attributes.
      2. Add trinarization layer(s), an "enrichment" layer and an ``NCells``
         column attribute to the aggregated file.
      3. Order clusters by Ward linkage on log-transformed marker profiles,
         permute the aggregated columns accordingly, renumber clusters in
         both files and store the linkage in ``dsout.attrs.linkage``.
      4. Permute the rows of both files so marker genes come first, ordered
         by the cluster in which they are most enriched.
      5. Run the annotators and, if "skeletonize" is among the configured
         steps, the graph skeletonizer.

    Args:
        ds: The dataset to aggregate; its "Clusters" attribute and its row
            and column order are modified in place.
        out_file: Path of the aggregated file to create.
        agg_spec: Mapping from column attribute name to aggregation method;
            a built-in default is used when None.
    """
    config = load_config()  # Generic config, just to get the paths

    if agg_spec is None:
        agg_spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "PCW": "mean"
        }

    # Only cells with a non-negative cluster label are counted
    clustered = ds.col_attrs["Clusters"] >= 0
    labels = ds.col_attrs["Clusters"][clustered]
    n_labels = len(set(labels))

    logging.info("Aggregating clusters")
    ds.aggregate(out_file, None, "Clusters", "mean", agg_spec)

    with loompy.connect(out_file) as dsout:
        logging.info("Trinarizing")
        # A single f is handled as a one-element sequence; the first layer
        # is always named plain "trinaries".
        if type(self.f) in (list, tuple):
            f_values = self.f
        else:
            f_values = [self.f]
        for position, f in enumerate(f_values):  # type: ignore
            layer_name = "trinaries" if position == 0 else f"trinaries_{f}"
            dsout.layers[layer_name] = Trinarizer(f=f).fit(ds)

        logging.info("Computing cluster gene enrichment scores")
        selector = FeatureSelectionByMultilevelEnrichment(mask=self.mask)
        markers = selector.fit(ds)
        dsout.layers["enrichment"] = selector.enrichment

        dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

        def marker_distances():
            # Pairwise distances between clusters on log-transformed marker genes
            profiles = np.log(dsout[:, :] + 1)[markers, :].T
            return pdist(profiles, "euclidean")

        # Renumber the clusters
        logging.info("Renumbering clusters by similarity, and permuting columns")
        tree = hc.linkage(marker_distances(), "ward", optimal_ordering=True)
        ordering = hc.leaves_list(tree)

        # Permute the aggregated file, and renumber
        dsout.permute(ordering, axis=1)
        dsout.ca.Clusters = np.arange(n_labels)

        # Redo the Ward's linkage just to get a tree that corresponds with the new ordering
        dsout.attrs.linkage = hc.linkage(marker_distances(), "ward", optimal_ordering=True)

        # Renumber the original file, and permute; labels not present in the
        # ordering are mapped to -1
        renumber = dict(zip(ordering, np.arange(n_labels)))
        ds.ca.Clusters = np.array([renumber.get(old, -1) for old in ds.ca.Clusters])
        ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

        # Reorder the genes, markers first, ordered by enrichment in clusters
        logging.info("Permuting rows")
        n_rows = ds.shape[0]
        is_marker = np.zeros(n_rows, dtype=bool)
        is_marker[markers] = True
        # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
        enrichment_layer = dsout.layer["enrichment"]
        sort_key = np.zeros(n_rows, dtype='int')
        sort_key[is_marker] = np.argmax(enrichment_layer[is_marker, :], axis=1)
        # Offsetting by the number of clusters puts non-markers after all markers
        sort_key[~is_marker] = np.argmax(enrichment_layer[~is_marker, :], axis=1) + dsout.shape[1]
        gene_order = np.argsort(sort_key)
        ds.permute(gene_order, axis=0)
        dsout.permute(gene_order, axis=0)

        logging.info("Computing auto-annotation")
        AutoAnnotator(root=config.paths.autoannotation, ds=dsout).annotate(dsout)

        logging.info("Computing auto-auto-annotation")
        AutoAutoAnnotator(n_genes=6).annotate(dsout)

        if "skeletonize" in config.steps:
            logging.info("Graph skeletonization")
            GraphSkeletonizer(min_pct=1).abstract(ds, dsout)