def _fit(self, ds: loompy.LoomConnection,
             labels: np.ndarray) -> np.ndarray:
        """
        Compute a per-cluster gene enrichment score and pick marker genes.

        For every gene and cluster, the score multiplies two ratios, each
        comparing the cluster against all cells *outside* that cluster:
        the fraction of non-zero values and the mean value.

        Args:
            ds:      Dataset connection; ``ds.shape`` is read as
                     (n_genes, n_cells).
            labels:  Cluster label per cell, used with ``np.bincount`` and
                     as the grouping for ``ds.aggregate``.

        Returns:
            A tuple ``(included, enrichment, means)``: a boolean mask over
            genes marking the selected markers, the enrichment matrix and
            the per-cluster means.
            NOTE(review): the signature's return annotation says
            ``np.ndarray`` although a 3-tuple is returned.

        Remarks:
            ``self.valid_genes`` is computed and stored on the instance if
            it is None. Each gene is selected for at most one cluster. If a
            cluster has fewer eligible genes than
            ``self.n_markers_per_cluster``, all eligible ones are taken.
        """
        logging.info("Computing enrichment statistic")
        n_labels = len(np.unique(labels))
        n_genes, n_cells = ds.shape

        # Number of cells per cluster
        sizes = np.bincount(labels, minlength=n_labels)
        # Number of nonzero values per cluster
        nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
        # Mean value per cluster
        means = ds.aggregate(None, None, labels, "mean", None)
        # Non-zeros and means over all cells
        (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean],
                                              axis=0)
        # Scale by number of cells
        f_nnz = nnz / sizes
        f_nnz_overall = nnz_overall / n_cells

        # Means and fraction non-zero values in other clusters (per cluster):
        # subtract the cluster's own total from the overall total and divide
        # by the number of remaining cells. With a single cluster holding all
        # cells the denominator is zero.
        n_other = n_cells - sizes
        means_other = ((means_overall * n_cells)[None].T -
                       (means * sizes)) / n_other
        f_nnz_other = ((f_nnz_overall * n_cells)[None].T -
                       (f_nnz * sizes)) / n_other

        # The cluster is compared against the *other* cells rather than
        # against the overall population; small constants are added to both
        # sides of each ratio.
        enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (
            means_other + 0.01)

        # Select best markers
        if self.valid_genes is None:
            logging.info("Identifying valid genes")
            # Reuse the overall non-zero counts computed above instead of
            # scanning the whole dataset a second time.
            self.valid_genes = np.logical_and(nnz_overall > 10,
                                              nnz_overall < n_cells * 0.6)

        if self.mask is None:
            excluded = set(np.where(~self.valid_genes)[0])
        else:
            excluded = set(np.where(((~self.valid_genes) | self.mask))[0])

        included = np.zeros(n_genes, dtype=bool)
        for ix in range(n_labels):
            # Genes of this cluster, most enriched first
            enriched = np.argsort(enrichment[:, ix])[::-1]
            count = 0
            # Iterating over the ranking (rather than indexing with an
            # unbounded counter) stops cleanly when the candidates run out.
            for gene in enriched:
                if count >= self.n_markers_per_cluster:
                    break
                if gene in excluded:
                    continue
                included[gene] = True
                # A selected gene cannot be a marker of a later cluster
                excluded.add(gene)
                count += 1
        return (included, enrichment, means)
Exemple #2
0
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  *,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        """
        Aggregate a clustered dataset into a per-cluster file and annotate it.

        Steps, in order: aggregate ``ds`` by its "Clusters" attribute into
        ``out_file`` (mean); store trinarization and enrichment layers;
        reorder clusters by Ward linkage on log-transformed marker
        profiles; renumber and permute both files accordingly; reorder the
        genes with markers first; run the annotators; and, if
        "skeletonize" is listed in ``config.steps``, run graph
        skeletonization.

        Args:
            ds:        Input dataset connection. It is modified: its
                       "Clusters" column attribute is renumbered and both
                       its columns and rows are permuted.
            out_file:  Path of the aggregated file that is created and
                       then reopened for writing.
            agg_spec:  Mapping of column attribute name to aggregation
                       method, passed to ``ds.aggregate``. When None, a
                       built-in default specification is used.
        """
        config = load_config()  # Generic config, just to get the paths
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "PCW": "mean"
            }
        # Only cells with a non-negative cluster label are counted
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        def log_marker_profiles(agg, selection):
            # One row per cluster, one column per marker gene, log(x + 1).
            # Rows are subset before the log so only markers are transformed.
            return np.log(agg[:, :][selection, :] + 1).T

        logging.info("Aggregating clusters")
        ds.aggregate(out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if isinstance(self.f, (list, tuple)):
                # One layer per threshold; the first gets the plain name
                for ix, f in enumerate(self.f):  # type: ignore
                    trinaries = Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = Trinarizer(f=self.f).fit(ds)  # type:ignore
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            fe = FeatureSelectionByMultilevelEnrichment(mask=self.mask)
            markers = fe.fit(ds)
            dsout.layers["enrichment"] = fe.enrichment

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")

            D = pdist(log_marker_profiles(dsout, markers), 'euclidean')
            Z = hc.linkage(D, 'ward', optimal_ordering=True)
            ordering = hc.leaves_list(Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Redo the Ward's linkage just to get a tree that corresponds with the new ordering
            D = pdist(log_marker_profiles(dsout, markers), 'euclidean')
            dsout.attrs.linkage = hc.linkage(D, 'ward', optimal_ordering=True)

            # Renumber the original file, and permute.
            # Labels not present in the ordering are mapped to -1.
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array([d.get(x, -1) for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            # Offsetting by the number of clusters sorts non-markers after
            # every marker
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            logging.info("Computing auto-annotation")
            AutoAnnotator(root=config.paths.autoannotation,
                          ds=dsout).annotate(dsout)

            logging.info("Computing auto-auto-annotation")
            AutoAutoAnnotator(n_genes=6).annotate(dsout)

            if "skeletonize" in config.steps:
                logging.info("Graph skeletonization")
                GraphSkeletonizer(min_pct=1).abstract(ds, dsout)