def aggregate(self, ds: loompy.LoomConnection, out_file: str, agg_spec: Dict[str, str] = None) -> None:
    """
    Aggregate ``ds`` by cluster into ``out_file`` and reorder both files.

    The method works in place on ``ds`` and on the newly created aggregate file:

    1. Calls ``cg.aggregate_loom`` grouping on the "Clusters" column attribute
       with "mean" as the aggregation method.
    2. Stores the output of ``cg.Trinarizer`` as the "trinaries" layer (plus
       one "trinaries_{f}" layer for every additional value when ``self.f``
       is a list or tuple).
    3. Stores the output of ``cg.MarkerSelection`` as the "enrichment" and
       "enrichment_q" layers, and the per-cluster cell counts as "NCells".
    4. Reorders the clusters by Ward linkage with optimal leaf ordering,
       permutes the columns of both files and renumbers "Clusters".
    5. Permutes the rows of both files so that marker genes come first,
       grouped by the cluster in which their enrichment is highest.
    6. Writes a "ClusterScore" column attribute to the aggregate file.

    Args:
        ds: Open loom connection. Its "Clusters" attribute is rewritten and
            its rows and columns are permuted.
        out_file: Path of the aggregate loom file to create.
        agg_spec: Mapping from column attribute name to aggregation method,
            passed through to ``cg.aggregate_loom``. A built-in default is
            used when ``None``.
    """
    if agg_spec is None:
        agg_spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean"
        }
    # Only columns with a non-negative cluster label are counted
    cells = ds.col_attrs["Clusters"] >= 0
    labels = ds.col_attrs["Clusters"][cells]
    n_labels = len(set(labels))

    logging.info("Aggregating clusters by mean")
    cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
    with loompy.connect(out_file) as dsout:
        logging.info("Trinarizing")
        if isinstance(self.f, (list, tuple)):
            for ix, f in enumerate(self.f):
                trinaries = cg.Trinarizer(f=f).fit(ds)
                # The first value gets the canonical layer name
                if ix == 0:
                    dsout.layers["trinaries"] = trinaries
                else:
                    dsout.layers[f"trinaries_{f}"] = trinaries
            # NOTE: after the loop `trinaries` holds the result for the LAST
            # value of self.f; that is what ClusterScore below is based on.
        else:
            trinaries = cg.Trinarizer(f=self.f).fit(ds)
            dsout.layers["trinaries"] = trinaries

        logging.info("Computing cluster gene enrichment scores")
        (markers, enrichment, qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
        dsout.layers["enrichment"] = enrichment
        dsout.layers["enrichment_q"] = qvals

        dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

        # Renumber the clusters
        logging.info("Renumbering clusters by similarity, and permuting columns")
        if "_Selected" in ds.ra:
            genes = (ds.ra._Selected == 1)
        else:
            logging.info("Normalization")
            normalizer = cg.Normalizer(False)
            normalizer.fit(ds)
            logging.info("Selecting up to 1000 genes")
            genes = cg.FeatureSelection(1000).fit(ds, mu=normalizer.mu, sd=normalizer.sd)

        # One row per cluster, one column per selected gene
        data = np.log(dsout[:, :] + 1)[genes, :].T
        D = pdist(data, 'euclidean')
        Z = hc.linkage(D, 'ward')
        optimal_Z = optimal_leaf_ordering(Z, D)
        ordering = hc.leaves_list(optimal_Z)

        # Permute the aggregated file, and renumber
        dsout.permute(ordering, axis=1)
        dsout.ca.Clusters = np.arange(n_labels)

        # Renumber the original file, and permute
        d = dict(zip(ordering, np.arange(n_labels)))
        # Labels that are not part of the ordering become -1
        new_clusters = np.array([d[x] if x in d else -1 for x in ds.ca.Clusters])
        ds.ca.Clusters = new_clusters
        ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

        # Reorder the genes, markers first, ordered by enrichment in clusters
        logging.info("Permuting rows")
        mask = np.zeros(ds.shape[0], dtype=bool)
        mask[markers] = True
        # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
        gene_order = np.zeros(ds.shape[0], dtype='int')
        gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :], axis=1)
        # Offsetting by the number of clusters sorts every non-marker after all markers
        gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :], axis=1) + dsout.shape[1]
        gene_order = np.argsort(gene_order)
        ds.permute(gene_order, axis=0)
        dsout.permute(gene_order, axis=0)

        # `trinaries` was computed before any permutation, so apply the same
        # column and row reordering, then keep the leading n_markers * n_labels genes.
        data = trinaries[:, ordering][gene_order, :][:self.n_markers * n_labels, :].T
        # Each cluster is scored over its own block of n_markers genes; the
        # block width must agree with the n_markers used in the slice above.
        # (assumes every cluster contributes n_markers consecutive genes after
        # reordering -- TODO confirm against cg.MarkerSelection)
        cluster_scores = [
            data[ix, ix * self.n_markers:(ix + 1) * self.n_markers].sum()
            for ix in range(n_labels)
        ]
        dsout.ca.ClusterScore = np.array(cluster_scores)
def aggregate(self, ds: loompy.LoomConnection, *, out_file: str, agg_spec: Dict[str, str] = None) -> None:
    """
    Build the cluster-level aggregate of ``ds`` in ``out_file`` and reorder both files.

    In order, the method:

    * aggregates ``ds`` on the "Clusters" column attribute using "mean";
    * writes the ``Trinarizer`` output as "trinaries" / "trinaries_{f}" layers;
    * writes the enrichment computed by ``FeatureSelectionByMultilevelEnrichment``
      as the "enrichment" layer and the per-cluster cell counts as "NCells";
    * orders the clusters by Ward linkage (with optimal ordering) on the
      log-transformed aggregate restricted to the selected marker genes,
      permutes columns of both files, renumbers "Clusters", and stores a
      linkage recomputed on the permuted aggregate as ``dsout.attrs.linkage``;
    * permutes the rows of both files so that markers come first, grouped by
      the cluster where their enrichment peaks;
    * runs ``AutoAnnotator`` and ``AutoAutoAnnotator`` on the aggregate, and
      ``GraphSkeletonizer`` when "skeletonize" is listed in ``config.steps``.

    Args:
        ds: Open loom connection; modified in place ("Clusters" rewritten,
            rows and columns permuted).
        out_file: Path of the aggregate loom file to create (keyword-only).
        agg_spec: Column attribute name -> aggregation method, forwarded to
            ``ds.aggregate``. A built-in default is used when ``None``.
    """
    config = load_config()  # Generic config, just to get the paths
    if agg_spec is None:
        agg_spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "PCW": "mean"
        }

    # Cluster labels of the columns that carry a non-negative label
    assigned = ds.col_attrs["Clusters"] >= 0
    labels = ds.col_attrs["Clusters"][assigned]
    n_clusters = len(set(labels))

    logging.info("Aggregating clusters")
    ds.aggregate(out_file, None, "Clusters", "mean", agg_spec)

    with loompy.connect(out_file) as dsout:

        def ward_tree():
            # Ward linkage over clusters, using the aggregate as it is laid out right now
            profiles = np.log(dsout[:, :] + 1)[markers, :].T
            distances = pdist(profiles, 'euclidean')
            return hc.linkage(distances, 'ward', optimal_ordering=True)

        logging.info("Trinarizing")
        if type(self.f) in (list, tuple):
            for ix, f in enumerate(self.f):  # type: ignore
                # The first value gets the canonical layer name
                layer_name = "trinaries" if ix == 0 else f"trinaries_{f}"
                dsout.layers[layer_name] = Trinarizer(f=f).fit(ds)
        else:
            dsout.layers["trinaries"] = Trinarizer(f=self.f).fit(ds)  # type:ignore

        logging.info("Computing cluster gene enrichment scores")
        selector = FeatureSelectionByMultilevelEnrichment(mask=self.mask)
        markers = selector.fit(ds)
        dsout.layers["enrichment"] = selector.enrichment

        dsout.ca.NCells = np.bincount(labels, minlength=n_clusters)

        # Renumber the clusters
        logging.info("Renumbering clusters by similarity, and permuting columns")
        ordering = hc.leaves_list(ward_tree())

        # Permute the aggregated file, and renumber
        dsout.permute(ordering, axis=1)
        dsout.ca.Clusters = np.arange(n_clusters)

        # Redo the Ward's linkage just to get a tree that corresponds with the new ordering
        dsout.attrs.linkage = ward_tree()

        # Renumber the original file, and permute
        old_to_new = dict(zip(ordering, np.arange(n_clusters)))
        # Labels absent from the ordering are mapped to -1
        ds.ca.Clusters = np.array([old_to_new.get(label, -1) for label in ds.ca.Clusters])
        column_order = np.argsort(ds.col_attrs["Clusters"])
        ds.permute(column_order, axis=1)

        # Reorder the genes, markers first, ordered by enrichment in clusters
        logging.info("Permuting rows")
        n_genes = ds.shape[0]
        is_marker = np.zeros(n_genes, dtype=bool)
        is_marker[markers] = True
        # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
        sort_key = np.zeros(n_genes, dtype='int')
        sort_key[is_marker] = np.argmax(dsout.layer["enrichment"][is_marker, :], axis=1)
        # Shifting by the number of clusters places every non-marker after all markers
        sort_key[~is_marker] = np.argmax(dsout.layer["enrichment"][~is_marker, :], axis=1) + dsout.shape[1]
        row_order = np.argsort(sort_key)
        ds.permute(row_order, axis=0)
        dsout.permute(row_order, axis=0)

        logging.info("Computing auto-annotation")
        annotator = AutoAnnotator(root=config.paths.autoannotation, ds=dsout)
        annotator.annotate(dsout)

        logging.info("Computing auto-auto-annotation")
        AutoAutoAnnotator(n_genes=6).annotate(dsout)

        if "skeletonize" in config.steps:
            logging.info("Graph skeletonization")
            GraphSkeletonizer(min_pct=1).abstract(ds, dsout)