def run(self) -> None:
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(self.n_markers).aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
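
# The run() method above assumes it is defined on a Luigi task: self.input().fn points at the
# upstream clustered .loom file and self.output().temporary_path() provides an atomic write path.
# Below is a minimal, hypothetical sketch of that scaffolding; the class names, parameter
# defaults, and file paths are assumptions added for illustration, not part of this pipeline.
import luigi


class ExampleClusteredLoom(luigi.ExternalTask):
    """Hypothetical stand-in for the upstream task that produces the clustered .loom file."""
    def output(self) -> luigi.LocalTarget:
        return luigi.LocalTarget("clustered.loom")  # assumed path


class ExampleAggregateTask(luigi.Task):
    """Hypothetical task whose run() would look like the method above."""
    n_markers = luigi.IntParameter(default=10)    # assumed default
    n_auto_genes = luigi.IntParameter(default=6)  # assumed default

    def requires(self) -> luigi.Task:
        return ExampleClusteredLoom()

    def output(self) -> luigi.LocalTarget:
        return luigi.LocalTarget("clustered.agg.loom")  # assumed path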
def aggregate_export(self) -> None:
    # Aggregate and compute enrichment, trinaries etc.
    logging.info("Aggregating loom file")
    ds_training = loompy.connect(os.path.join(self.classified_dir, "classified.loom"))
    classes = ds_training.col_attrs["SubclassAssigned"]
    # Encode the assigned subclasses as integer cluster labels for aggregation
    ds_training.ca.Clusters = LabelEncoder().fit_transform(classes)
    out_file = os.path.join(self.classified_dir, "classified.agg.loom")
    cg.Aggregator(10).aggregate(ds_training, out_file)
    with loompy.connect(out_file) as dsagg:
        logging.info("Computing auto-annotation")
        aa = cg.AutoAnnotator(root=cg.paths().autoannotation)
        aa.annotate_loom(dsagg)
        aa.save_in_loom(dsagg)

        logging.info("Computing auto-auto-annotation")
        n_clusters = dsagg.shape[1]
        (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
        dsagg.ca.MarkerGenes = np.array([" ".join(ds_training.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)])
        np.set_printoptions(precision=1, suppress=True)
        dsagg.ca.MarkerSelectivity = np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)])
        dsagg.ca.MarkerSpecificity = np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)])
        dsagg.ca.MarkerRobustness = np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)])
    ds_training.close()  # the training file is no longer needed once its row attributes have been read

    # Export aggregated expression plus the enrichment and trinarization layers as tab-delimited tables
    out_dir = os.path.join(self.classified_dir, "classified_exported")
    logging.info("Exporting cluster data")
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    with loompy.connect(out_file) as dsagg:
        dsagg.export(os.path.join(out_dir, "classified_expression.tab"))
        dsagg.export(os.path.join(out_dir, "classified_enrichment.tab"), layer="enrichment")
        dsagg.export(os.path.join(out_dir, "classified_enrichment_q.tab"), layer="enrichment_q")
        dsagg.export(os.path.join(out_dir, "classified_trinaries.tab"), layer="trinaries")
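
# Optional sanity check, not part of the pipeline above: the .tab extensions suggest that
# loompy's export() writes tab-delimited text, so an exported layer can be inspected with
# pandas. The relative path below is an assumption about where the script is run from.
import pandas as pd

trinaries_table = pd.read_csv("classified_exported/classified_trinaries.tab", sep="\t", header=None)
print(trinaries_table.shape)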
def run(self) -> None:
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        ds = loompy.connect(self.input().fn)
        spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "Bucket": "mode",
            "Region": "first",
            "OriginalClusters": "first",
            "Probable_location": "first",
            "Developmental_compartment": "first",
            "Description": "first",
            "Location_based_on": "first",
            "Neurotransmitter": "first",
            "LeafOrder": "first",
            "Comment": "first",
            "ClusterName": "first",
            "TaxonomyRank1": "first",
            "TaxonomyRank2": "first",
            "TaxonomyRank3": "first",
            "TaxonomyRank4": "first",
            "TaxonomySymbol": "first"
        }
        cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
        dsagg = loompy.connect(out_file)

        logging.info("Computing auto-annotation")
        aa = cg.AutoAnnotator(root=am.paths().autoannotation)
        aa.annotate_loom(dsagg)
        aa.save_in_loom(dsagg)

        logging.info("Computing auto-auto-annotation")
        n_clusters = dsagg.shape[1]
        (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes).fit(dsagg)
        dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
        np.set_printoptions(precision=1, suppress=True)
        dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
        dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
        dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
        dsagg.close()
def run(self) -> None:
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        ds = loompy.connect(self.input().fn)
        spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "Bucket": "mode",
            "Region": "first",
            "OriginalClusters": "first",
            "LeafOrder": "first",
            "Probable_location": "first",
            "Developmental_compartment": "first",
            "Description": "first",
            "Location_based_on": "first",
            "Neurotransmitter": "first",
            "Comment": "first",
            "ClusterName": "first",
            "TaxonomyRank1": "first",
            "TaxonomyRank2": "first",
            "TaxonomyRank3": "first",
            "TaxonomyRank4": "first",
            "TaxonomySymbol": "first"
        }
        cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
        with loompy.connect(out_file) as dsagg:
            logging.info("Finding non-neuronal, housekeeping, and troublemaking genes")
            (nng, blocked) = _gene_selection_L5(dsagg)

            logging.info("Manifold learning on the aggregate file")
            normalizer = cg.Normalizer(False)
            normalizer.fit(dsagg)
            pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10), max_n_components=50)
            pca.fit(dsagg, normalizer)
            transformed = pca.transform(dsagg, normalizer)

            # Build a weighted KNN graph over clusters, then keep only mutual (reciprocal) edges
            k = 40
            bnn = cg.BalancedKNN(k=k, maxl=2 * k)
            bnn.fit(transformed)
            knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
            n_cells = knn.shape[0]
            a = np.tile(np.arange(n_cells), k)
            b = np.reshape(knn.T, (n_cells * k,))
            w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8), n_cells)  # rank-based edge weights
            knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
            threshold = w > 0.025
            mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
            mknn = mknn.minimum(mknn.transpose()).tocoo()
            tsne = cg.TSNE(perplexity=5).layout(transformed)
            dsagg.col_graphs.KNN = knn
            dsagg.col_graphs.MKNN = mknn
            dsagg.ca._X = tsne[:, 0]
            dsagg.ca._Y = tsne[:, 1]

            logging.info("Manifold learning on all cells")
            # Initialize each cell at its cluster's position in the aggregate layout, plus Gaussian jitter
            init = np.zeros((ds.shape[1], 2))
            for lbl in np.unique(ds.ca.Clusters):
                init[ds.ca.Clusters == lbl, :] = tsne[lbl, :] + np.random.normal(size=((ds.ca.Clusters == lbl).sum(), 2))
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
            (knn, mknn, tsne) = ml.fit(ds, initial_pos=init, nng=nng, blocked_genes=blocked)
            ds.col_graphs.KNN = knn
            ds.col_graphs.MKNN = mknn
            ds.ca._X = tsne[:, 0]
            ds.ca._Y = tsne[:, 1]

            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root="../auto-annotation/Adolescent/")
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            logging.info("Computing auto-auto-annotation")
            n_clusters = dsagg.shape[1]
            (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=6).fit(dsagg)
            dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
            np.set_printoptions(precision=1, suppress=True)
            dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
            dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
            dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
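
# The mutual-KNN step above (mknn.minimum(mknn.transpose())) keeps an edge only if it appears
# in both directions, with the smaller of the two weights retained. A self-contained toy
# illustration with scipy.sparse; the data below is made up, not from the pipeline.
import numpy as np
from scipy import sparse


def _toy_mutual_knn_example() -> None:
    # Directed edges 0->1, 1->0, 1->2, 2->0 with made-up weights
    rows = np.array([0, 1, 1, 2])
    cols = np.array([1, 0, 2, 0])
    weights = np.array([0.9, 0.5, 0.8, 0.7])
    toy_knn = sparse.coo_matrix((weights, (rows, cols)), shape=(3, 3))
    toy_mknn = toy_knn.minimum(toy_knn.transpose()).tocoo()
    print(toy_mknn.toarray())  # only the reciprocal 0<->1 edge survives, with weight 0.5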
def run(self) -> None:
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator().aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([" ".join(ds.ra.Gene[selected[:, ix]]) for ix in np.arange(n_clusters)]), axis=1)
                np.set_printoptions(precision=1, suppress=True)
                dsagg.set_attr("MarkerSelectivity", np.array([str(selectivity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerSpecificity", np.array([str(specificity[:, ix]) for ix in np.arange(n_clusters)]), axis=1)
                dsagg.set_attr("MarkerRobustness", np.array([str(robustness[:, ix]) for ix in np.arange(n_clusters)]), axis=1)

                tissue = self.tissue
                labels = ds.col_attrs["Clusters"]
                if self.tissue == "All":
                    dsagg.ca.Bucket = np.array([self.major_class] * dsagg.shape[1])
                else:
                    # Figure out which cells should be collected
                    cells: List[int] = []
                    # Clusters for which some rule in the schedule matched
                    clusters_seen: Dict[int, str] = {}
                    schedule = pooling_schedule_L3[self.tissue]

                    # Where to send clusters when no rules match
                    _default_schedule: str = None
                    for aa_tag, sendto in schedule:
                        if aa_tag == "*":
                            _default_schedule = sendto

                    # For each cluster in the tissue
                    bucket_list = []
                    for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                        # For each rule in the schedule
                        for aa_tag, sendto in schedule:
                            if aa_tag in agg_aa.split(","):
                                if ix in clusters_seen:
                                    logging.info(f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{clusters_seen[ix]}')")
                                else:
                                    clusters_seen[ix] = f"{aa_tag} -> {sendto}"
                                    logging.info(f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}")
                                    bucket_list.append(sendto)
                        if ix not in clusters_seen:
                            if _default_schedule is None:
                                logging.info(f"{tissue}/{ix}/{agg_aa}: No matching rule")
                                bucket_list.append("Excluded")
                            else:
                                # Fall back to the catch-all "*" rule
                                clusters_seen[ix] = f"* -> {_default_schedule}"
                                logging.info(f"{tissue}/{ix}/{agg_aa}: * -> {_default_schedule}")
                                bucket_list.append(_default_schedule)
                    dsagg.ca.Bucket = np.array(bucket_list)
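
# pooling_schedule_L3 is referenced above but not defined in this file. From the way it is used
# (schedule = pooling_schedule_L3[self.tissue]; for aa_tag, sendto in schedule) it is assumed to
# map each tissue to an ordered list of (auto-annotation tag, target bucket) rules, with "*" as
# a catch-all. The tissue, tags, and bucket names below are hypothetical, for illustration only.
from typing import Dict, List, Tuple

_example_pooling_schedule: Dict[str, List[Tuple[str, str]]] = {
    "Cortex": [
        ("@ASTRO", "Astrocytes"),   # hypothetical tag and bucket
        ("@NEURON", "Neurons"),     # hypothetical tag and bucket
        ("*", "Excluded"),          # catch-all: anything not matched above
    ]
}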