Example #1
0
    def run(self) -> None:
        """Aggregate the input loom file and annotate the aggregated clusters.

        The input file is aggregated with ``cg.Aggregator(self.n_markers)``
        into the temporary output path. The aggregated file then receives the
        auto-annotation and the column attributes ``MarkerGenes``,
        ``MarkerSelectivity``, ``MarkerSpecificity`` and ``MarkerRobustness``
        computed by ``cg.AutoAutoAnnotator``.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            # The source file must stay open until the marker genes have been
            # written, because the gene names are read from ds.ra.Gene below.
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(self.n_markers).aggregate(ds, out_file)
                with loompy.connect(out_file) as dsagg:
                    for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                        logging.info(f"Cluster {ix} score {score:.1f}")

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    gene_names = ds.ra.Gene
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(gene_names[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the per-cluster score vectors are rendered
                    # by str() below (process-wide numpy setting).
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, scores in (
                            ("MarkerSelectivity", selectivity),
                            ("MarkerSpecificity", specificity),
                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(scores[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)
Example #2
0
    def aggregate_export(self) -> None:
        """Aggregate ``classified.loom`` by assigned subclass and export tables.

        Steps, all relative to ``self.classified_dir``:

        1. Encode the ``SubclassAssigned`` column attribute as integer labels
           and store them as ``Clusters`` in ``classified.loom``.
        2. Aggregate into ``classified.agg.loom`` and add the auto-annotation
           and marker-gene column attributes to it.
        3. Export the main matrix and the ``enrichment``, ``enrichment_q`` and
           ``trinaries`` layers as ``.tab`` files under ``classified_exported``.
        """
        # Aggregate and compute enrichment, trinaries etc.
        logging.info("Aggregating loom file")
        out_file = os.path.join(self.classified_dir, "classified.agg.loom")
        # The context manager makes sure the training file is closed even when
        # aggregation or annotation fails.
        with loompy.connect(
                os.path.join(self.classified_dir,
                             "classified.loom")) as ds_training:
            classes = ds_training.col_attrs["SubclassAssigned"]
            ds_training.ca.Clusters = LabelEncoder().fit_transform(classes)
            cg.Aggregator(10).aggregate(ds_training, out_file)
            with loompy.connect(out_file) as dsagg:
                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=cg.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity,
                 robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                gene_names = ds_training.ra.Gene
                dsagg.ca.MarkerGenes = np.array([
                    " ".join(gene_names[selected[:, ix]])
                    for ix in range(n_clusters)
                ])
                # Controls how the score vectors are rendered by str() below.
                np.set_printoptions(precision=1, suppress=True)
                dsagg.ca.MarkerSelectivity = np.array(
                    [str(selectivity[:, ix]) for ix in range(n_clusters)])
                dsagg.ca.MarkerSpecificity = np.array(
                    [str(specificity[:, ix]) for ix in range(n_clusters)])
                dsagg.ca.MarkerRobustness = np.array(
                    [str(robustness[:, ix]) for ix in range(n_clusters)])

        out_dir = os.path.join(self.classified_dir, "classified_exported")
        logging.info("Exporting cluster data")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(out_dir, exist_ok=True)
        with loompy.connect(out_file) as dsagg:
            dsagg.export(os.path.join(out_dir, "classified_expression.tab"))
            for layer_name, file_name in (
                    ("enrichment", "classified_enrichment.tab"),
                    ("enrichment_q", "classified_enrichment_q.tab"),
                    ("trinaries", "classified_trinaries.tab")):
                dsagg.export(os.path.join(out_dir, file_name),
                             layer=layer_name)
Example #3
0
	def run(self) -> None:
		"""Export tables and plots for the aggregated (L1) tissue file.

		``self.input()[0]`` is the aggregated loom file and ``self.input()[1]``
		the per-cell loom file. All outputs are written into the temporary
		output directory with the file name prefix ``L1_<tissue>``.
		"""
		logging = cg.logging(self, True)
		logging.info("Exporting cluster data")
		with self.output().temporary_path() as out_dir:
			# exist_ok avoids the check-then-create race of exists() + mkdir().
			os.makedirs(out_dir, exist_ok=True)
			prefix = os.path.join(out_dir, "L1_" + self.tissue)
			with loompy.connect(self.input()[0].fn) as dsagg:
				logging.info("Computing auto-annotation")
				aa = cg.AutoAnnotator(root=am.paths().autoannotation)
				aa.annotate_loom(dsagg)
				aa.save_in_loom(dsagg)

				dsagg.export(prefix + "_expression.tab")
				dsagg.export(prefix + "_enrichment.tab", layer="enrichment")
				dsagg.export(prefix + "_enrichment_q.tab", layer="enrichment_q")
				dsagg.export(prefix + "_trinaries.tab", layer="trinaries")

				# The context manager closes the per-cell file even if a plot fails.
				with loompy.connect(self.input()[1].fn) as ds:
					logging.info("Plotting MKNN graph")
					cg.plot_knn(ds, prefix + "_manifold.mknn.png")

					# The class plot is best-effort: a failure is reported in the
					# log instead of being silently dropped, and the task goes on.
					logging.info("Plotting manifold graph with classes")
					try:
						cg.plot_classes(ds, prefix + "_manifold.classes.png")
					except Exception as e:
						logging.info(f"Skipping manifold graph with classes: {e!r}")

					logging.info("Plotting manifold graph with auto-annotation")
					tags = list(dsagg.col_attrs["AutoAnnotation"])
					cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

					logging.info("Plotting manifold graph with auto-auto-annotation")
					tags = list(dsagg.col_attrs["MarkerGenes"])
					cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

					logging.info("Plotting marker heatmap")
					cg.plot_markerheatmap(ds, dsagg, n_markers_per_cluster=self.n_markers, out_file=prefix + "_heatmap.pdf")
Example #4
0
    def run(self) -> None:
        """Export tables, plots and discordance distances for an L3 target.

        ``self.input()[0]`` is the aggregated loom file and ``self.input()[1]``
        the per-cell loom file. All outputs are written into the temporary
        output directory with the file name prefix ``L3_<target>``.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_dir:
            logging.info("Exporting cluster data")
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(out_dir, exist_ok=True)
            prefix = os.path.join(out_dir, "L3_" + self.target)
            # Context managers close both loom files even when a step fails.
            with loompy.connect(self.input()[0].fn) as dsagg:
                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                dsagg.export(prefix + "_expression.tab")
                dsagg.export(prefix + "_enrichment.tab", layer="enrichment")
                dsagg.export(prefix + "_enrichment_q.tab",
                             layer="enrichment_q")
                dsagg.export(prefix + "_trinaries.tab", layer="trinaries")

                # Tags are passed to the plots ordered by cluster label.
                cluster_order = np.argsort(dsagg.col_attrs["Clusters"])
                with loompy.connect(self.input()[1].fn) as ds:
                    logging.info(
                        "Plotting manifold graph with auto-annotation")
                    tags = list(dsagg.col_attrs["AutoAnnotation"][cluster_order])
                    cg.plot_graph(ds, prefix + "_manifold.aa.png", tags)

                    logging.info(
                        "Plotting manifold graph with auto-auto-annotation")
                    tags = list(dsagg.col_attrs["MarkerGenes"][cluster_order])
                    cg.plot_graph(ds, prefix + "_manifold.aaa.png", tags)

                    logging.info("Plotting marker heatmap")
                    cg.plot_markerheatmap(ds,
                                          dsagg,
                                          n_markers_per_cluster=self.n_markers,
                                          out_file=prefix + "_heatmap.pdf")

                logging.info("Computing discordance distances")
                pep = 0.05
                n_labels = dsagg.shape[1]

                def discordance_distance(a: np.ndarray,
                                         b: np.ndarray) -> float:
                    """
                    Number of genes that are discordant with given PEP, divided by number of clusters
                    """
                    return np.sum((1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

                # One row per cluster, restricted to the first n_labels * 10
                # rows (genes) of the "trinaries" layer.
                data = dsagg.layer["trinaries"][:n_labels * 10, :].T
                D = squareform(pdist(data, discordance_distance))
                with open(prefix + "_distances.txt", "w") as f:
                    # Only the first superdiagonal is written, i.e. the
                    # distance between each cluster and the next one.
                    f.write(str(np.diag(D, k=1)))
Example #5
0
    def run(self) -> None:
        """Aggregate the input loom file with an explicit attribute spec.

        The input is aggregated with ``cg.Aggregator(f=[0.2, 0.05])`` into the
        temporary output path, using ``spec`` to choose how each column
        attribute is combined. The aggregated file then receives the
        auto-annotation and the column attributes ``MarkerGenes``,
        ``MarkerSelectivity``, ``MarkerSpecificity`` and ``MarkerRobustness``.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            # Column attribute name -> aggregation mode passed to the Aggregator.
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "LeafOrder": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first"
            }
            # Context managers close both files on every exit path, including
            # when annotation raises part-way through.
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(f=[0.2, 0.05]).aggregate(ds,
                                                       out_file,
                                                       agg_spec=spec)
                with loompy.connect(out_file) as dsagg:
                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    gene_names = ds.ra.Gene
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(gene_names[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the score vectors are rendered by str() below.
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, scores in (
                            ("MarkerSelectivity", selectivity),
                            ("MarkerSpecificity", specificity),
                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(scores[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)
Example #6
0
    def run(self) -> None:
        """Aggregate the input file, lay out both files and annotate clusters.

        Steps:

        1. Aggregate the input loom file into the temporary output path.
        2. On the aggregated file: select genes, project with PCA, build
           KNN/MKNN column graphs and a t-SNE layout, and store them.
        3. On the input file: run ``cg.ManifoldLearning2`` starting from the
           aggregate layout and store the resulting graphs and coordinates.
        4. Add the auto-annotation and marker-gene column attributes to the
           aggregated file.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            # Column attribute name -> aggregation mode passed to the Aggregator.
            spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean",
                "Bucket": "mode",
                "Region": "first",
                "OriginalClusters": "first",
                "LeafOrder": "first",
                "Probable_location": "first",
                "Developmental_compartment": "first",
                "Description": "first",
                "Location_based_on": "first",
                "Neurotransmitter": "first",
                "Comment": "first",
                "ClusterName": "first",
                "TaxonomyRank1": "first",
                "TaxonomyRank2": "first",
                "TaxonomyRank3": "first",
                "TaxonomyRank4": "first",
                "TaxonomySymbol": "first"
            }
            # The context manager closes the input file on every exit path.
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator(f=[0.2, 0.05]).aggregate(ds,
                                                       out_file,
                                                       agg_spec=spec)

                with loompy.connect(out_file) as dsagg:
                    logging.info(
                        "Finding non-neuronal, housekeeping, and troublemaking genes"
                    )
                    (nng, blocked) = _gene_selection_L5(dsagg)

                    logging.info("Manifold learning on the aggregate file")
                    normalizer = cg.Normalizer(False)
                    normalizer.fit(dsagg)
                    pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10),
                                           max_n_components=50)
                    pca.fit(dsagg, normalizer)
                    transformed = pca.transform(dsagg, normalizer)
                    k = 40
                    bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                    bnn.fit(transformed)
                    # Drop the first neighbour column (presumably each point
                    # itself - TODO confirm against BalancedKNN).
                    knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                    n_cells = knn.shape[0]
                    # Edge list (a -> b); the weight decays with the
                    # neighbour rank as 1 / rank ** 1.8.
                    a = np.tile(np.arange(n_cells), k)
                    b = np.reshape(knn.T, (n_cells * k, ))
                    w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8),
                                  n_cells)
                    knn = sparse.coo_matrix((w, (a, b)),
                                            shape=(n_cells, n_cells))
                    # MKNN keeps only edges with weight above 0.025 ...
                    threshold = w > 0.025
                    mknn = sparse.coo_matrix(
                        (w[threshold], (a[threshold], b[threshold])),
                        shape=(n_cells, n_cells))
                    # ... that are present in both directions (element-wise
                    # minimum with the transpose).
                    mknn = mknn.minimum(mknn.transpose()).tocoo()
                    tsne = cg.TSNE(perplexity=5).layout(transformed)
                    dsagg.col_graphs.KNN = knn
                    dsagg.col_graphs.MKNN = mknn
                    dsagg.ca._X = tsne[:, 0]
                    dsagg.ca._Y = tsne[:, 1]

                    logging.info("Manifold learning on all cells")
                    # Start every cell at its cluster's aggregate t-SNE
                    # position plus standard-normal jitter.
                    cell_clusters = ds.ca.Clusters
                    init = np.zeros((ds.shape[1], 2))
                    for lbl in np.unique(cell_clusters):
                        members = cell_clusters == lbl
                        init[members, :] = tsne[lbl, :] + np.random.normal(
                            size=(members.sum(), 2))
                    ml = cg.ManifoldLearning2(gtsne=True,
                                              alpha=1,
                                              max_iter=3000)
                    (knn, mknn, tsne) = ml.fit(ds,
                                               initial_pos=init,
                                               nng=nng,
                                               blocked_genes=blocked)
                    ds.col_graphs.KNN = knn
                    ds.col_graphs.MKNN = mknn
                    ds.ca._X = tsne[:, 0]
                    ds.ca._Y = tsne[:, 1]

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(
                        root="../auto-annotation/Adolescent/")
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
                    gene_names = ds.ra.Gene
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(gene_names[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the score vectors are rendered by str() below.
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, scores in (
                            ("MarkerSelectivity", selectivity),
                            ("MarkerSpecificity", specificity),
                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(scores[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)
Example #7
0
    def run(self) -> None:
        """Aggregate the input file, annotate clusters and assign buckets.

        After aggregation and annotation, every aggregated cluster gets a
        ``Bucket`` column attribute:

        * when ``self.tissue`` equals ``"All"``, every cluster is assigned
          ``self.major_class``;
        * otherwise the rules in ``pooling_schedule_L3[self.tissue]`` are
          applied. Each rule is an ``(auto-annotation tag, bucket)`` pair. The
          first rule whose tag is among the cluster's comma-separated
          auto-annotation tags wins. Clusters matching no rule go to the
          bucket of the ``"*"`` rule if one exists, else to ``"Excluded"``.
        """
        logging = cg.logging(self)
        with self.output().temporary_path() as out_file:
            logging.info("Aggregating loom file")
            with loompy.connect(self.input().fn) as ds:
                cg.Aggregator().aggregate(ds, out_file)
                with loompy.connect(out_file) as dsagg:
                    for ix, score in enumerate(
                            dsagg.col_attrs["ClusterScore"]):
                        logging.info(f"Cluster {ix} score {score:.1f}")

                    logging.info("Computing auto-annotation")
                    aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                    aa.annotate_loom(dsagg)
                    aa.save_in_loom(dsagg)

                    logging.info("Computing auto-auto-annotation")
                    n_clusters = dsagg.shape[1]
                    (selected, selectivity, specificity,
                     robustness) = cg.AutoAutoAnnotator(
                         n_genes=self.n_auto_genes).fit(dsagg)
                    gene_names = ds.ra.Gene
                    dsagg.set_attr("MarkerGenes",
                                   np.array([
                                       " ".join(gene_names[selected[:, ix]])
                                       for ix in range(n_clusters)
                                   ]),
                                   axis=1)
                    # Controls how the score vectors are rendered by str() below.
                    np.set_printoptions(precision=1, suppress=True)
                    for attr_name, scores in (
                            ("MarkerSelectivity", selectivity),
                            ("MarkerSpecificity", specificity),
                            ("MarkerRobustness", robustness)):
                        dsagg.set_attr(attr_name,
                                       np.array([
                                           str(scores[:, ix])
                                           for ix in range(n_clusters)
                                       ]),
                                       axis=1)

                    tissue = self.tissue

                    # Compare by value: identity ("is") against a string
                    # literal depends on interning and is not reliable.
                    if tissue == "All":
                        dsagg.ca.Bucket = np.array([self.major_class] *
                                                   dsagg.shape[1])
                    else:
                        # Cluster index -> description of the rule that
                        # claimed it
                        clusters_seen: Dict[int, str] = {}
                        schedule = pooling_schedule_L3[tissue]

                        # Where to send clusters when no rules match (the
                        # last "*" rule wins if there are several)
                        _default_schedule = None
                        for aa_tag, sendto in schedule:
                            if aa_tag == "*":
                                _default_schedule = sendto

                        # For each cluster in the tissue
                        bucket_list = []
                        for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                            cluster_tags = agg_aa.split(",")
                            # For each rule in the schedule
                            for aa_tag, sendto in schedule:
                                if aa_tag not in cluster_tags:
                                    continue
                                if ix in clusters_seen:
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{clusters_seen[ix]}')"
                                    )
                                else:
                                    clusters_seen[
                                        ix] = f"{aa_tag} -> {sendto}"
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}"
                                    )
                                    bucket_list.append(sendto)
                            if ix not in clusters_seen:
                                if _default_schedule is None:
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: No matching rule"
                                    )
                                    bucket_list.append("Excluded")
                                else:
                                    # Report the "*" rule explicitly; the loop
                                    # variable aa_tag only holds whichever rule
                                    # happened to be last in the schedule.
                                    clusters_seen[
                                        ix] = f"* -> {_default_schedule}"
                                    logging.info(
                                        f"{tissue}/{ix}/{agg_aa}: * -> {_default_schedule}"
                                    )
                                    bucket_list.append(_default_schedule)
                        dsagg.ca.Bucket = np.array(bucket_list)