コード例 #1
0
ファイル: aggregator.py プロジェクト: chitrita/cytograph
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean"
            }
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters by mean")
        cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if type(self.f) is list or type(self.f) is tuple:
                for ix, f in enumerate(self.f):
                    trinaries = cg.Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = cg.Trinarizer(f=self.f).fit(ds)
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            (markers, enrichment,
             qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
            dsout.layers["enrichment"] = enrichment
            dsout.layers["enrichment_q"] = qvals

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")
            if "_Selected" in ds.ra:
                genes = (ds.ra._Selected == 1)
            else:
                logging.info("Normalization")
                normalizer = cg.Normalizer(False)
                normalizer.fit(ds)
                logging.info("Selecting up to 1000 genes")
                genes = cg.FeatureSelection(1000).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

            data = np.log(dsout[:, :] + 1)[genes, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward')
            optimal_Z = optimal_leaf_ordering(Z, D)
            ordering = hc.leaves_list(optimal_Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array(
                [d[x] if x in d else -1 for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            data = trinaries[:, ordering][gene_order, :][:self.n_markers *
                                                         n_labels, :].T
            cluster_scores = []
            for ix in range(n_labels):
                cluster_scores.append(data[ix, ix * 10:(ix + 1) * 10].sum())
            dsout.ca.ClusterScore = np.array(cluster_scores)
コード例 #2
0
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  *,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        config = load_config()  # Generic config, just to get the paths
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "PCW": "mean"
            }
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters")
        ds.aggregate(out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if type(self.f) is list or type(self.f) is tuple:
                for ix, f in enumerate(self.f):  # type: ignore
                    trinaries = Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = Trinarizer(f=self.f).fit(ds)  # type:ignore
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            fe = FeatureSelectionByMultilevelEnrichment(mask=self.mask)
            markers = fe.fit(ds)
            dsout.layers["enrichment"] = fe.enrichment

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")

            data = np.log(dsout[:, :] + 1)[markers, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward', optimal_ordering=True)
            ordering = hc.leaves_list(Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Redo the Ward's linkage just to get a tree that corresponds with the new ordering
            data = np.log(dsout[:, :] + 1)[markers, :].T
            D = pdist(data, 'euclidean')
            dsout.attrs.linkage = hc.linkage(D, 'ward', optimal_ordering=True)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array(
                [d[x] if x in d else -1 for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            logging.info("Computing auto-annotation")
            AutoAnnotator(root=config.paths.autoannotation,
                          ds=dsout).annotate(dsout)

            logging.info("Computing auto-auto-annotation")
            AutoAutoAnnotator(n_genes=6).annotate(dsout)

            if "skeletonize" in config.steps:
                logging.info("Graph skeletonization")
                GraphSkeletonizer(min_pct=1).abstract(ds, dsout)