def run(self) -> None:
    """Copy the valid cells of the input file and fit Cytograph2 on the copy.

    Columns whose ``_Valid`` attribute equals 1 are appended to the temporary
    output file; the output is then reopened and handed to
    ``cg.Cytograph2(...).fit`` configured from this task's attributes.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        # The input is only read from, hence mode="r".
        with loompy.connect(self.input().fn, mode="r") as ds_in:
            logging.info("Collecting valid cells")
            valid_cells = np.where(ds_in.col_attrs["_Valid"] == 1)[0]
            for _, _, view in ds_in.scan(items=valid_cells, axis=1, key="Accession"):
                loompy.create_append(out_file, view.layers, view.ra, view.ca)

        with loompy.connect(out_file) as ds_out:
            logging.info(f"Found {ds_out.shape[1]} valid cells")
            logging.info("Learning the manifold")
            model = cg.Cytograph2(
                accel=self.accel,
                log=self.log,
                normalize=self.normalize,
                a=self.a,
                b=self.b,
                c=self.c,
                d=self.d,
                k=self.k,
                k_smoothing=self.k_smoothing,
                n_factors=self.n_factors,
                max_iter=200,
            )
            model.fit(ds_out)
def run(self) -> None:
    """Copy valid cells, learn the manifold and cluster on it.

    Columns with ``_Valid == 1`` are appended to the temporary output file.
    The output then receives the KNN/MKNN column graphs, the embedding
    coordinates (``_X``, ``_Y``) and the ``Clusters``/``Outliers`` attributes.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        with loompy.connect(self.input().fn) as ds_in:
            valid_cells = np.where(ds_in.col_attrs["_Valid"] == 1)[0]
            for _, _, view in ds_in.scan(items=valid_cells, axis=1, key="Accession"):
                loompy.create_append(out_file, view.layers, view.ra, view.ca)

        with loompy.connect(out_file) as ds_out:
            logging.info("Learning the manifold")
            learner = cg.ManifoldLearning2(
                n_genes=self.n_genes,
                gtsne=self.gtsne,
                alpha=self.alpha,
                filter_cellcycle=self.filter_cellcycle,
                layer=self.layer,
            )
            knn, mknn, embedding = learner.fit(ds_out)
            ds_out.col_graphs.KNN = knn
            ds_out.col_graphs.MKNN = mknn
            ds_out.ca._X = embedding[:, 0]
            ds_out.ca._Y = embedding[:, 1]

            logging.info("Clustering on the manifold L1")
            cluster_labels = cg.PolishedLouvain().fit_predict(ds_out)
            # Labels are shifted by one when stored; -1 labels are flagged as outliers.
            ds_out.ca.Clusters = cluster_labels + 1
            ds_out.ca.Outliers = (cluster_labels == -1).astype('int')
            logging.info(f"Found {cluster_labels.max() + 1} clusters")
def run(self) -> None:
    """Extract the cells of one taxon and recompute its manifold.

    Cells whose ``TaxonomyRank{rank}`` attribute equals ``self.taxon`` are
    appended to the temporary output file. In the output, clusters are
    renumbered with ``LabelEncoder``, the ``_Valid`` gene flag is recomputed
    and the KNN/MKNN graphs and embedding coordinates are stored.

    Raises:
        ValueError: if no cell belongs to the requested taxon.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        with loompy.connect(self.input().fn) as ds:
            cells = np.where(ds.ca[f"TaxonomyRank{self.rank}"] == self.taxon)[0]
            # `cells` holds column indices, so test its length. Summing the
            # indices would wrongly report "empty" when only cell 0 matches.
            if cells.size == 0:
                raise ValueError(f"No cells found in taxon {self.taxon}!")
            for _, _, view in ds.scan(items=cells, axis=1, key="Accession"):
                loompy.create_append(out_file, view.layers, view.ra, view.ca)

        logging.info("Renumbering the clusters")
        with loompy.connect(out_file) as dsout:
            # Renumber the clusters
            dsout.ca.Clusters = LabelEncoder().fit_transform(dsout.ca.Clusters)

            logging.info("Recomputing the list of valid genes")
            nnz = dsout.map([np.count_nonzero], axis=0)[0]
            # Keep genes detected in more than 10 cells but fewer than 60% of cells.
            valid_genes = np.logical_and(nnz > 10, nnz < dsout.shape[1] * 0.6)
            dsout.ra._Valid = valid_genes.astype('int')

            logging.info("Learning the manifold")
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1)
            (knn, mknn, tsne) = ml.fit(dsout)
            dsout.col_graphs.KNN = knn
            dsout.col_graphs.MKNN = mknn
            dsout.ca._X = tsne[:, 0]
            dsout.ca._Y = tsne[:, 1]
def run(self) -> None:
    """Copy the input file and replace ``Clusters`` by encoded taxon labels.

    The ``TaxonomyRank{rank}`` column attribute of the copy is passed through
    ``LabelEncoder`` and the resulting codes are stored as ``Clusters``.
    """
    # The returned logger is not used here; the call itself is kept as in the
    # sibling tasks (NOTE(review): presumably it configures logging — confirm).
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        copyfile(self.input().fn, out_file)
        with loompy.connect(out_file) as ds:
            taxon_labels = ds.ca[f"TaxonomyRank{self.rank}"]
            encoder = LabelEncoder()
            ds.ca.Clusters = encoder.fit_transform(taxon_labels)
def run(self) -> None:
    """Export tab files and plots for one taxonomy rank into the output folder.

    ``self.input()[0]`` is the aggregate file (exported as expression,
    enrichment, enrichment_q and trinaries tables) and ``self.input()[1]`` is
    the per-cell file used for the manifold plots and the marker heatmap.
    """
    logging = cg.logging(self, True)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        def out_path(suffix: str) -> str:
            # All files share the "L6_R<rank>_" prefix inside out_dir.
            return os.path.join(out_dir, f"L6_R{self.rank}_{suffix}")

        with loompy.connect(self.input()[0].fn) as dsagg:
            dsagg.export(out_path("expression.tab"))
            for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(out_path(f"{layer_name}.tab"), layer=layer_name)

            logging.info("Plotting manifold graph with auto-annotation")
            with loompy.connect(self.input()[1].fn) as ds:
                cg.plot_graph(ds, out_path("manifold.aa.png"), list(dsagg.ca.AutoAnnotation))

                logging.info("Plotting manifold graph with auto-auto-annotation")
                cg.plot_graph(ds, out_path("manifold.aaa.png"), list(dsagg.ca.MarkerGenes))

                logging.info("Plotting manifold graph with taxon names")
                taxon_names = list(dsagg.ca[f"TaxonomyRank{self.rank}"])
                cg.plot_graph(ds, out_path("manifold.names.png"), taxon_names)

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(
                    ds,
                    dsagg,
                    n_markers_per_cluster=self.n_markers,
                    out_file=out_path("heatmap.pdf"),
                )
def run(self) -> None:
    """Aggregate the input file by cluster and annotate the aggregate.

    After aggregation the cluster scores are logged, auto-annotation is
    computed and saved, and the auto-auto-annotation results are stored as the
    ``MarkerGenes``, ``MarkerSelectivity``, ``MarkerSpecificity`` and
    ``MarkerRobustness`` column attributes of the aggregate file.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # The input stays open for the whole block: gene names are read from
        # it when the marker attributes are built below.
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(self.n_markers).aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for cluster_ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {cluster_ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                annotator = cg.AutoAnnotator(root=am.paths().autoannotation)
                annotator.annotate_loom(dsagg)
                annotator.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                auto_auto = cg.AutoAutoAnnotator(n_genes=self.n_auto_genes)
                selected, selectivity, specificity, robustness = auto_auto.fit(dsagg)

                marker_names = [
                    " ".join(ds.ra.Gene[selected[:, c]]) for c in range(n_clusters)
                ]
                dsagg.set_attr("MarkerGenes", np.array(marker_names), axis=1)

                # Affects how the score vectors below are rendered by str().
                np.set_printoptions(precision=1, suppress=True)
                for attr_name, scores in (
                    ("MarkerSelectivity", selectivity),
                    ("MarkerSpecificity", specificity),
                    ("MarkerRobustness", robustness),
                ):
                    rendered = [str(scores[:, c]) for c in range(n_clusters)]
                    dsagg.set_attr(attr_name, np.array(rendered), axis=1)
def run(self) -> None:
    """Pool the cells of one major class from all inputs and fit Cytograph2.

    For every input file, the cells whose ``Class`` equals ``self.major_class``
    are appended to the temporary output file. For the "Oligos" class the
    selection is additionally filtered and capped at 5000 cells per input.
    """
    logging = cg.logging(self)
    doublet_genes = (
        'Stmn2', 'Aqp4', 'Gja1', 'C1qc', 'Aif1', 'Cldn5', 'Fn1', 'Hbb-bt',
        'Hbb-bh1', 'Hbb-bh2', 'Hbb-y', 'Hbb-bs', 'Hba-a1', 'Hba-a2', 'Hba-x',
    )
    with self.output().temporary_path() as out_file:
        for clustered in self.input():
            with loompy.connect(clustered.fn, "r") as ds:
                logging.info("Split/pool from " + clustered.fn)
                cells = np.where(ds.ca.Class == self.major_class)[0]
                if self.major_class == "Oligos":
                    # Special selection of cells for the oligo class, to balance between tissues
                    enough_genes = ds.map((np.count_nonzero, ), axis=1)[0] > 1000
                    has_pdgfra = ds[ds.ra.Gene == "Pdgfra", :][0] > 0
                    has_meg3 = ds[ds.ra.Gene == "Meg3", :][0] > 0
                    # A cell expressing any of the listed genes is flagged as a doublet.
                    is_doublet = np.zeros(ds.shape[1], dtype='bool')
                    for gene in doublet_genes:
                        expressed = ds[ds.ra.Gene == gene, :][0] > 0
                        is_doublet = np.logical_or(is_doublet, expressed)
                    ok_cells = enough_genes & (~is_doublet) & (has_pdgfra | ~has_meg3)
                    cells = np.intersect1d(cells, np.where(ok_cells)[0])
                    if cells.shape[0] > 5000:
                        # Random subsample without replacement.
                        cells = np.random.choice(cells, 5000, False)
                # Both the "Oligos" and the generic path end with the same copy step.
                for _, _, view in ds.scan(items=cells, axis=1, key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra, view.ca)

        with loompy.connect(out_file) as pooled:
            logging.info(f"Found {pooled.shape[1]} valid cells")
            logging.info("Learning the manifold")
            cg.Cytograph2(max_iter=100).fit(pooled)
def run(self) -> None:
    """Annotate the L1 aggregate file and export its tables and plots.

    ``self.input()[0]`` is the aggregate file, which is auto-annotated in
    place and exported as tab files. ``self.input()[1]`` is the per-cell file
    used for the MKNN, class, annotation and heatmap plots. All outputs are
    written into the temporary output folder with an ``L1_<tissue>_`` prefix.
    """
    logging = cg.logging(self, True)
    logging.info("Exporting cluster data")
    with self.output().temporary_path() as out_dir:
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        prefix = "L1_" + self.tissue

        def out_path(suffix: str) -> str:
            # Every exported file lives in out_dir and shares the tissue prefix.
            return os.path.join(out_dir, prefix + suffix)

        with loompy.connect(self.input()[0].fn) as dsagg:
            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            dsagg.export(out_path("_expression.tab"))
            dsagg.export(out_path("_enrichment.tab"), layer="enrichment")
            dsagg.export(out_path("_enrichment_q.tab"), layer="enrichment_q")
            dsagg.export(out_path("_trinaries.tab"), layer="trinaries")

            # Use a context manager so the per-cell file is closed even when a
            # plotting call fails (the connection used to be left open).
            with loompy.connect(self.input()[1].fn) as ds:
                logging.info("Plotting MKNN graph")
                cg.plot_knn(ds, out_path("_manifold.mknn.png"))

                # The class plot is best-effort, but the reason for skipping it
                # is now logged instead of being silently discarded.
                try:
                    logging.info("Plotting manifold graph with classes")
                    cg.plot_classes(ds, out_path("_manifold.classes.png"))
                except Exception as exc:
                    logging.info(f"Skipping manifold graph with classes: {exc}")

                logging.info("Plotting manifold graph with auto-annotation")
                tags = list(dsagg.col_attrs["AutoAnnotation"])
                cg.plot_graph(ds, out_path("_manifold.aa.png"), tags)

                logging.info("Plotting manifold graph with auto-auto-annotation")
                tags = list(dsagg.col_attrs["MarkerGenes"])
                cg.plot_graph(ds, out_path("_manifold.aaa.png"), tags)

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(
                    ds,
                    dsagg,
                    n_markers_per_cluster=self.n_markers,
                    out_file=out_path("_heatmap.pdf"),
                )
def run(self) -> None:
    """Export tab files and plots for one L2 (major class, tissue) pair.

    ``self.input()[0]`` is the aggregate file and ``self.input()[1]`` the
    per-cell file. Outputs go to the temporary output folder and are named
    ``L2_<major_class>_<tissue>_<suffix>``.
    """
    logging = cg.logging(self, True)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        stem = "L2_" + self.major_class + "_" + self.tissue

        def out_path(suffix: str) -> str:
            # Shared naming scheme for every exported artifact.
            return os.path.join(out_dir, stem + suffix)

        with loompy.connect(self.input()[0].fn) as dsagg:
            logging.info("Exporting tab files")
            dsagg.export(out_path("_expression.tab"))
            for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(out_path("_" + layer_name + ".tab"), layer=layer_name)

            logging.info("Plotting manifold graph with auto-annotation")
            # NOTE(review): unlike the MarkerGenes tags below, these tags are
            # not reordered by "Clusters" — confirm this is intended.
            aa_tags = list(dsagg.col_attrs["AutoAnnotation"])
            with loompy.connect(self.input()[1].fn) as ds:
                cg.plot_graph(ds, out_path("_manifold.aa.png"), aa_tags)

                logging.info("Plotting manifold graph with auto-auto-annotation")
                cluster_order = np.argsort(dsagg.col_attrs["Clusters"])
                marker_tags = list(dsagg.col_attrs["MarkerGenes"][cluster_order])
                cg.plot_graph(ds, out_path("_manifold.aaa.png"), marker_tags)

                logging.info("Plotting manifold graph with classes")
                cg.plot_classes(ds, out_path("_manifold.classes.png"))

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(
                    ds,
                    dsagg,
                    n_markers_per_cluster=10,
                    out_file=out_path("_heatmap.pdf"),
                )

                logging.info("Plotting latent factors")
                cg.plot_factors(ds, base_name=out_path("_factors"))
def run(self) -> None:
    """Pool curated L4 clusters into one file and attach the curated metadata.

    Steps, as implemented below:

    1. Read the cell-type summary spreadsheet from ``curated_L4`` into a
       column-name -> values dict.
    2. For every input sample, append the cells whose cluster is listed for
       that sample's bucket (the bucket name is ``basename(sample)[3:-5]``).
    3. In the pooled file, store ``OriginalClusters``/``Bucket``, renumber
       ``Clusters`` by encoded ``ClusterName``, and copy the curated
       per-cluster attributes and taxonomy ranks onto the cells.
    4. Recompute the ``_Valid`` gene flag and learn the manifold.

    Cells are matched to spreadsheet rows by the bucket name derived from
    their sample file name. Previously this relied on a hard-coded absolute
    path and therefore only matched on one machine.
    """
    logging = cg.logging(self)
    samples = [x.fn for x in self.input()]
    max_cluster_id = 0
    cluster_ids: List[int] = []
    original_ids: List[int] = []
    samples_per_cell: List[str] = []

    celltypes_summary_file = os.path.join(
        am.paths().build, "curated_L4", "celltypes_summary_leaforder16-Dec-2017.xlsx")
    celltypes_summary = pd.read_excel(celltypes_summary_file)
    # One entry per spreadsheet column: column name -> array of row values.
    celltypes_dict = {
        celltypes_summary.columns.values[i]: celltypes_summary.values[:, i]
        for i in range(celltypes_summary.shape[1])
    }

    with self.output().temporary_path() as out_file:
        for sample in samples:
            with loompy.connect(sample) as ds:
                logging.info(f"Adding {ds.shape[1]} cells from {sample}")
                # Strip the 3-character prefix and 5-character suffix of the
                # file name to obtain the bucket name used in the spreadsheet.
                target = os.path.basename(sample)[3:-5]
                not_excluded = celltypes_dict["OriginalCluster"][
                    celltypes_dict["Bucket"] == target]
                cells = np.where(np.isin(ds.ca.Clusters, not_excluded))[0]
                for (_, selection, view) in ds.scan(items=cells, axis=1, key="Accession"):
                    cluster_ids += list(view.ca.Clusters + max_cluster_id)
                    original_ids += list(view.ca.Clusters)
                    samples_per_cell += [sample] * selection.shape[0]
                    loompy.create_append(out_file, view.layers, view.ra, view.ca,
                                         fill_values="auto")
                # Guard against max() on an empty list when no cell has been
                # collected so far.
                if cluster_ids:
                    max_cluster_id = max(cluster_ids) + 1
        logging.info(f"Found {max_cluster_id} clusters total")

        with loompy.connect(out_file) as ds:
            ds.ca.Clusters = np.array(cluster_ids)
            ds.ca.OriginalClusters = np.array(original_ids)
            ds.ca.Bucket = np.array(samples_per_cell)

            # Per-cell lookup keys, derived the same way as `target` above so
            # the match does not depend on where the sample files live.
            cell_buckets = np.array(
                [os.path.basename(s)[3:-5] for s in samples_per_cell])
            cell_original_clusters = np.array(original_ids)

            leaf_order = np.zeros(ds.shape[1], dtype='int') - 1
            le = LabelEncoder()
            le.fit(celltypes_dict["ClusterName"])
            new_clusters = np.zeros(ds.shape[1], dtype='int') - 1  # -1 marks unmatched cells
            d = {}
            for attr in [
                    "LeafOrder", "Probable_location", "Developmental_compartment",
                    "Region", "Description", "Location_based_on",
                    "Neurotransmitter", "ClusterName", "Taxonomy_group", "Comment"
            ]:
                d[attr] = np.array([""] * ds.shape[1], dtype=object)

            for ix in range(len(celltypes_dict["Bucket"])):
                bucket = celltypes_dict["Bucket"][ix]
                original_cluster = celltypes_dict["OriginalCluster"][ix]
                cells = np.logical_and(cell_buckets == bucket,
                                       cell_original_clusters == original_cluster)
                leaf_order[cells] = celltypes_dict["LeafOrder"][ix]
                new_clusters[cells] = le.transform([celltypes_dict["ClusterName"][ix]])
                for attr in d:
                    d[attr][cells] = celltypes_dict[attr][ix]

            logging.info(f"Found {new_clusters.max() + 1} clusters total")
            ds.ca.Clusters = new_clusters
            ds.ca.LeafOrder = leaf_order
            for key, vals in d.items():
                ds.ca[key] = vals.astype("unicode")

            taxonomy_file = os.path.join(am.paths().build, "curated_L4", "Taxonomy.xlsx")
            taxonomy_table = pd.read_excel(taxonomy_file)
            # Rows are keyed by their fourth column; the value is the full row.
            taxonomy = {
                taxonomy_table.values[i, 3]: taxonomy_table.values[i, :]
                for i in range(taxonomy_table.shape[0])
            }
            tax1 = np.array([""] * ds.shape[1], dtype=object)
            tax2 = np.array([""] * ds.shape[1], dtype=object)
            tax3 = np.array([""] * ds.shape[1], dtype=object)
            tax4 = np.array([""] * ds.shape[1], dtype=object)
            taxs = np.array([""] * ds.shape[1], dtype=object)
            for i in range(ds.shape[1]):
                # `new_clusters` is what was just stored as ds.ca.Clusters.
                if new_clusters[i] == -1:
                    continue
                taxonomy_row = taxonomy[d["Taxonomy_group"][i]]
                tax1[i] = taxonomy_row[0]
                tax2[i] = taxonomy_row[1]
                tax3[i] = taxonomy_row[2]
                tax4[i] = taxonomy_row[3]
                taxs[i] = taxonomy_row[4]
            ds.ca.TaxonomyRank1 = tax1
            ds.ca.TaxonomyRank2 = tax2
            ds.ca.TaxonomyRank3 = tax3
            ds.ca.TaxonomyRank4 = tax4
            ds.ca.TaxonomySymbol = taxs

            logging.info("Recomputing the list of valid genes")
            nnz = ds.map([np.count_nonzero], axis=0)[0]
            valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
            ds.ra._Valid = valid_genes.astype('int')

            logging.info("Learning the manifold")
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
            (knn, mknn, tsne) = ml.fit(ds)
            ds.col_graphs.KNN = knn
            ds.col_graphs.MKNN = mknn
            ds.ca._X = tsne[:, 0]
            ds.ca._Y = tsne[:, 1]
def run(self) -> None:
    """Aggregate the L5 file by cluster and annotate the aggregate.

    The input is aggregated with an explicit per-attribute aggregation spec,
    then auto-annotation and auto-auto-annotation results are written to the
    aggregate file as column attributes.

    Both files are opened with context managers so they are closed even if a
    step fails. Previously the input was never closed and the aggregate only
    on success.
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # Column attribute -> aggregation method passed to cg.Aggregator.
        spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "Bucket": "mode",
            "Region": "first",
            "OriginalClusters": "first",
            "Probable_location": "first",
            "Developmental_compartment": "first",
            "Description": "first",
            "Location_based_on": "first",
            "Neurotransmitter": "first",
            "LeafOrder": "first",
            "Comment": "first",
            "ClusterName": "first",
            "TaxonomyRank1": "first",
            "TaxonomyRank2": "first",
            "TaxonomyRank3": "first",
            "TaxonomyRank4": "first",
            "TaxonomySymbol": "first"
        }
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
            with loompy.connect(out_file) as dsagg:
                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(
                    n_genes=self.n_auto_genes).fit(dsagg)
                # Gene names come from the (still open) input file.
                dsagg.set_attr("MarkerGenes", np.array([
                    " ".join(ds.ra.Gene[selected[:, ix]]) for ix in range(n_clusters)
                ]), axis=1)
                # Controls how the score vectors are rendered by str() below.
                np.set_printoptions(precision=1, suppress=True)
                for attr_name, scores in (
                    ("MarkerSelectivity", selectivity),
                    ("MarkerSpecificity", specificity),
                    ("MarkerRobustness", robustness),
                ):
                    dsagg.set_attr(attr_name, np.array([
                        str(scores[:, ix]) for ix in range(n_clusters)
                    ]), axis=1)
def run(self) -> None:
    """Export L5 taxon enrichment lists, tab files and an all-cells scatter plot.

    For each of the two trinarization layers, one text file is written with,
    per taxonomy rank and taxon, up to 20 genes whose trinary score exceeds
    0.999 in every cluster of the taxon, ordered by ascending mean score
    outside the taxon. The aggregate layers are then exported and the cells
    of the second input are plotted, colored by ``TaxonomyRank3``.
    """
    logging = cg.logging(self, True)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # (output file name, layer holding the trinary scores)
        enrichment_jobs = (
            ("L5_All_taxon_enrichment_0.2.txt", "trinaries"),
            ("L5_All_taxon_enrichment_0.05.txt", "trinaries_0.05"),
        )
        with loompy.connect(self.input()[0].fn) as dsagg:
            for file_name, layer_name in enrichment_jobs:
                with open(os.path.join(out_dir, file_name), 'w') as f:
                    logging.info("Computing taxon enrichment")
                    for rank in [1, 2, 3, 4]:
                        rank_attr = f"TaxonomyRank{rank}"
                        for taxon in list(set(dsagg.ca[rank_attr])):
                            inside = dsagg.ca[rank_attr] == taxon
                            outside = dsagg.ca[rank_attr] != taxon
                            # Genes scoring above 0.999 in every cluster of the taxon.
                            gix = np.where(
                                np.all(dsagg[layer_name][:, inside] > 0.999, axis=1))[0]
                            non_group_mean = np.mean(
                                dsagg[layer_name][gix, :][:, outside], axis=1)
                            ranked = gix[np.argsort(non_group_mean)]
                            genes = dsagg.ra.Gene[ranked][0:20]
                            f.write(str(rank) + " " + taxon + "\t" + "\t".join(genes) + "\n")

            dsagg.export(os.path.join(out_dir, "L5_All_expression.tab"))
            for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(os.path.join(out_dir, f"L5_All_{layer_name}.tab"),
                             layer=layer_name)

        logging.info("Plotting all cells t-SNE")
        # NOTE(review): the input path is joined onto out_dir; os.path.join
        # returns the input path unchanged only if it is absolute — confirm.
        with loompy.connect(os.path.join(out_dir, self.input()[1].fn)) as ds:
            fig = plt.figure(figsize=(3, 3))
            ax = fig.add_axes([0, 0, 1, 1])
            ax.axis('off')
            colors = cg.colorize(np.arange(52))
            for color_ix, taxon in enumerate(np.unique(ds.ca.TaxonomyRank3)):
                cells = ds.ca.TaxonomyRank3 == taxon
                plt.scatter(x=ds.ca._X[cells], y=ds.ca._Y[cells], s=10,
                            c=colors[color_ix, :], marker='.', label=taxon,
                            alpha=0.3, lw=0)
            lgnd = ax.legend(fontsize=10, labelspacing=0.2, loc="upper left",
                             bbox_to_anchor=(1, 1), frameon=False)
            # Make the legend markers large and opaque regardless of plot settings.
            for handle in lgnd.legendHandles:
                handle.set_sizes([250])
                handle.set_alpha(1)
            plt.savefig(os.path.join(out_dir, "L5_All.png"), dpi=600,
                        transparent=True, bbox_extra_artists=(lgnd, ),
                        bbox_inches='tight')
            plt.close()
def run(self) -> None:
    """Gather the cells scheduled for ``self.target`` and cluster them.

    For each (cell file, aggregate file) input pair, the curated schedule
    files in ``curated_L2`` that apply to the pair's tissue are read. Clusters
    whose curated target equals ``self.target`` contribute their cells to the
    output file and their enrichment ranking to the marker selection. The
    pooled output is then embedded (restricted to the selected marker genes)
    and clustered with ``PolishedLouvain``.

    Input files are opened with a context manager so each one is closed before
    the next is processed (they were previously left open).

    Raises:
        ValueError: if no input contributed any cell for the target.
    """
    logging = cg.logging(self, True)
    # Gene accessions of the first contributing file; used as reference order.
    accessions: np.ndarray = None
    curated_dir = os.path.join(am.paths().build, "curated_L2")
    with self.output().temporary_path() as out_file:
        logging.info("Gathering cells for " + self.target)
        enriched_markers: List[np.ndarray] = []  # The enrichment vector for each selected cluster
        cells_found = False
        for in_file, agg_file in self.input():
            tissue = os.path.basename(in_file.fn).split("_")[2].split(".")[0]
            with loompy.connect(in_file.fn) as ds, loompy.connect(agg_file.fn) as dsagg:
                enrichment = dsagg.layer["enrichment"][:, :]
                labels = ds.col_attrs["Clusters"]
                ordering: np.ndarray = None
                logging.info(tissue)

                # Figure out which cells should be collected
                cells: List[int] = []
                for fname in os.listdir(curated_dir):
                    if not fname.startswith("L2"):
                        continue
                    from_tissue = fname.split("_")[2]
                    if from_tissue != tissue:
                        continue
                    if tissue == "All":
                        major_class = fname.split("_")[1]
                        if major_class != self.target:
                            continue
                    logging.info("Gathering cells from " + in_file.fn)
                    logging.info("Gathering cells based on " + fname)
                    with open(os.path.join(curated_dir, fname)) as f:
                        # Drop the last character of each line (the newline)
                        # and split the line on tabs.
                        schedule = [x[:-1].split("\t") for x in f.readlines()]
                    for (cluster_str, n_cells, auto_target, curated_target, comment) in schedule:
                        cluster = int(cluster_str)
                        if curated_target == self.target:
                            if accessions is None:
                                accessions = ds.row_attrs["Accession"]
                            if ordering is None:
                                # Map this file's rows onto the reference accession order.
                                ordering = np.where(
                                    ds.row_attrs["Accession"][None, :] == accessions[:, None])[1]
                            cells += list(np.where(labels == cluster)[0])
                            enriched_markers.append(
                                np.argsort(-enrichment[:, cluster][ordering]))

                if len(cells) > 0:
                    cells = np.sort(np.array(cells))
                    cells_found = True
                    for (_, _, view) in ds.scan(items=cells, axis=1, key="Accession"):
                        loompy.create_append(out_file, view.layers, view.ra, view.ca)

        if not cells_found:
            raise ValueError(f"No cells matched any schedule for {self.target}")

        # Figure out which enriched markers to use: walk down all rankings in
        # parallel, adding unseen genes until enough have been collected.
        ix = 0
        temp: List[int] = []
        while len(temp) < self.n_enriched:
            for ranking in enriched_markers:
                if ranking[ix] not in temp:
                    temp.append(ranking[ix])
            ix += 1
        genes = np.sort(np.array(temp))

        logging.info("Learning the manifold")
        with loompy.connect(out_file) as dsout:
            ml = cg.ManifoldLearning2(gtsne=True, alpha=1, genes=genes)
            (knn, mknn, tsne) = ml.fit(dsout)
            dsout.col_graphs.KNN = knn
            dsout.col_graphs.MKNN = mknn
            dsout.ca._X = tsne[:, 0]
            dsout.ca._Y = tsne[:, 1]

            logging.info("Clustering on the manifold")
            # Per-target overrides of the default clustering resolution (1.0).
            special_res = {
                "Astrocytes": 0.6,
                "Sensory_Neurons": 0.35,
                "Brain_Granule": 0.6
            }
            r = special_res.get(self.target, 1.0)
            pl = cg.PolishedLouvain(resolution=r)
            labels = pl.fit_predict(dsout)
            dsout.ca.Clusters = labels + 1
            dsout.ca.Outliers = (labels == -1).astype('int')
            logging.info(f"Found {labels.max() + 1} clusters")
def run(self) -> None:
    """Aggregate an L2 file, annotate it and assign each cluster to a bucket.

    After aggregation and (auto-)auto-annotation, the ``Bucket`` column
    attribute of the aggregate file is set:

    * for tissue "All", every cluster goes to ``self.major_class``;
    * otherwise the rules in ``pooling_schedule_L3[self.tissue]`` are applied
      to each cluster's auto-annotation tags. The first matching rule wins,
      a ``"*"`` rule acts as the default, and clusters with no match and no
      default are sent to "Excluded".
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # The input stays open because gene names are read from it below.
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator().aggregate(ds, out_file)
            with loompy.connect(out_file) as dsagg:
                for ix, score in enumerate(dsagg.col_attrs["ClusterScore"]):
                    logging.info(f"Cluster {ix} score {score:.1f}")

                logging.info("Computing auto-annotation")
                aa = cg.AutoAnnotator(root=am.paths().autoannotation)
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(
                    n_genes=self.n_auto_genes).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([
                    " ".join(ds.ra.Gene[selected[:, ix]]) for ix in range(n_clusters)
                ]), axis=1)
                # Controls how the score vectors are rendered by str() below.
                np.set_printoptions(precision=1, suppress=True)
                for attr_name, scores in (
                    ("MarkerSelectivity", selectivity),
                    ("MarkerSpecificity", specificity),
                    ("MarkerRobustness", robustness),
                ):
                    dsagg.set_attr(attr_name, np.array([
                        str(scores[:, ix]) for ix in range(n_clusters)
                    ]), axis=1)

                tissue = self.tissue
                # Compare by value: `is "All"` tested object identity and is
                # not guaranteed to be true for an equal string.
                if self.tissue == "All":
                    dsagg.ca.Bucket = np.array([self.major_class] * dsagg.shape[1])
                else:
                    # Clusters for which some rule matched -> description of that rule
                    clusters_seen: Dict[int, str] = {}
                    schedule = pooling_schedule_L3[self.tissue]

                    # Where to send clusters when no rules match
                    _default_schedule: str = None
                    for aa_tag, sendto in schedule:
                        if aa_tag == "*":
                            _default_schedule = sendto

                    # For each cluster in the tissue
                    bucket_list = []
                    for ix, agg_aa in enumerate(dsagg.ca.AutoAnnotation):
                        cluster_tags = agg_aa.split(",")
                        # For each rule in the schedule
                        for aa_tag, sendto in schedule:
                            if aa_tag not in cluster_tags:
                                continue
                            if ix in clusters_seen:
                                # An earlier rule already claimed this cluster.
                                logging.info(
                                    f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto} (overruled by '{clusters_seen[ix]}')"
                                )
                            else:
                                clusters_seen[ix] = f"{aa_tag} -> {sendto}"
                                logging.info(f"{tissue}/{ix}/{agg_aa}: {aa_tag} -> {sendto}")
                                bucket_list.append(sendto)
                        if ix not in clusters_seen:
                            if _default_schedule is None:
                                logging.info(f"{tissue}/{ix}/{agg_aa}: No matching rule")
                                bucket_list.append("Excluded")
                            else:
                                # Report the default rule as "*". The previous
                                # message reused the last rule's tag left over
                                # from the loop above.
                                clusters_seen[ix] = f"* -> {_default_schedule}"
                                logging.info(
                                    f"{tissue}/{ix}/{agg_aa}: * -> {_default_schedule}"
                                )
                                bucket_list.append(_default_schedule)
                    dsagg.ca.Bucket = np.array(bucket_list)
def run(self) -> None:
    """Annotate the L3 aggregate file and export tables, plots and distances.

    ``self.input()[0]`` is the aggregate file (auto-annotated in place and
    exported as tab files) and ``self.input()[1]`` the per-cell file used for
    plotting. Finally a discordance distance between clusters is computed on
    the "trinaries" layer and the first off-diagonal is written to a text file.

    Both files are opened with context managers so they are closed when the
    task finishes or fails (they were previously never closed).
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        prefix = "L3_" + self.target

        def out_path(suffix: str) -> str:
            # Every exported file lives in out_dir and shares the target prefix.
            return os.path.join(out_dir, prefix + suffix)

        with loompy.connect(self.input()[0].fn) as dsagg:
            logging.info("Computing auto-annotation")
            aa = cg.AutoAnnotator(root=am.paths().autoannotation)
            aa.annotate_loom(dsagg)
            aa.save_in_loom(dsagg)

            dsagg.export(out_path("_expression.tab"))
            dsagg.export(out_path("_enrichment.tab"), layer="enrichment")
            dsagg.export(out_path("_enrichment_q.tab"), layer="enrichment_q")
            dsagg.export(out_path("_trinaries.tab"), layer="trinaries")

            # Tags are reordered by the aggregate's cluster ids.
            cluster_order = np.argsort(dsagg.col_attrs["Clusters"])
            with loompy.connect(self.input()[1].fn) as ds:
                logging.info("Plotting manifold graph with auto-annotation")
                tags = list(dsagg.col_attrs["AutoAnnotation"][cluster_order])
                cg.plot_graph(ds, out_path("_manifold.aa.png"), tags)

                logging.info("Plotting manifold graph with auto-auto-annotation")
                tags = list(dsagg.col_attrs["MarkerGenes"][cluster_order])
                cg.plot_graph(ds, out_path("_manifold.aaa.png"), tags)

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(ds, dsagg,
                                      n_markers_per_cluster=self.n_markers,
                                      out_file=out_path("_heatmap.pdf"))

            logging.info("Computing discordance distances")
            pep = 0.05
            n_labels = dsagg.shape[1]

            def discordance_distance(a: np.ndarray, b: np.ndarray) -> float:
                """
                Number of genes that are discordant with given PEP, divided by number of clusters
                """
                return np.sum((1 - a) * b + a * (1 - b) > 1 - pep) / n_labels

            # Only the first n_labels * 10 rows of the layer are used;
            # transposed so each cluster becomes one observation for pdist.
            data = dsagg.layer["trinaries"][:n_labels * 10, :].T
            D = squareform(pdist(data, discordance_distance))
            with open(out_path("_distances.txt"), "w") as f:
                # Distances between consecutive clusters (first off-diagonal).
                f.write(str(np.diag(D, k=1)))
def run(self) -> None:
    """Export tab files and plots for one taxon at one taxonomy rank.

    ``self.input()[0]`` is the aggregate file and ``self.input()[1]`` the
    per-cell file. Files are named ``L6_R<rank>_(<taxon>)_<suffix>`` inside
    the temporary output folder. The final scatter plot colors cells by
    ``ClusterName`` when the rank is 3 and by ``TaxonomyRank4`` otherwise.
    """
    logging = cg.logging(self, True)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting cluster data")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        def out_path(suffix: str) -> str:
            # Shared naming scheme for every exported artifact.
            return os.path.join(out_dir, f"L6_R{self.rank}_({self.taxon})_{suffix}")

        with loompy.connect(self.input()[0].fn) as dsagg:
            dsagg.export(out_path("expression.tab"))
            for layer_name in ("enrichment", "enrichment_q", "trinaries"):
                dsagg.export(out_path(f"{layer_name}.tab"), layer=layer_name)

            logging.info("Plotting manifold graph with auto-annotation")
            with loompy.connect(self.input()[1].fn) as ds:
                cg.plot_graph(ds, out_path("manifold.aa.png"), list(dsagg.ca.AutoAnnotation))

                logging.info("Plotting manifold graph with auto-auto-annotation")
                cg.plot_graph(ds, out_path("manifold.aaa.png"), list(dsagg.ca.MarkerGenes))

                logging.info("Plotting manifold graph with cluster names")
                cg.plot_graph(ds, out_path("manifold.names.png"), list(dsagg.ca.ClusterName))

                logging.info("Plotting marker heatmap")
                cg.plot_markerheatmap(ds, dsagg,
                                      n_markers_per_cluster=self.n_markers,
                                      out_file=out_path("heatmap.pdf"))

                # Marker size shrinks as the number of cells grows.
                size = 200000 / ds.shape[1]
                fig = plt.figure(figsize=(3, 3))
                ax = fig.add_axes([0, 0, 1, 1])
                ax.axis('off')

                # Pick the grouping attribute once; the drawing loop is shared.
                if self.rank == 3:
                    groups = ds.ca.ClusterName
                else:
                    groups = ds.ca.TaxonomyRank4
                colors = cg.colorize(np.unique(groups))
                for color_ix, group in enumerate(np.unique(groups)):
                    members = groups == group
                    plt.scatter(x=ds.ca._X[members], y=ds.ca._Y[members], s=size,
                                c=colors[color_ix, :], marker='.', label=group,
                                alpha=0.5, lw=0)

                lgnd = ax.legend(fontsize=10, labelspacing=0.2, loc="upper left",
                                 bbox_to_anchor=(1, 1), frameon=False)
                # Make the legend markers large and opaque regardless of plot settings.
                for handle in lgnd.legendHandles:
                    handle.set_sizes([250])
                    handle.set_alpha(1)
                plt.savefig(out_path("manifold.pretty.png"), dpi=600,
                            transparent=True, bbox_extra_artists=(lgnd, ),
                            bbox_inches='tight')
                plt.close()
def run(self) -> None:
    """Aggregate the L5 file, embed clusters and cells, and annotate.

    Steps, as implemented below:

    1. Aggregate the input by cluster with an explicit aggregation spec.
    2. On the aggregate: select gene sets via ``_gene_selection_L5``, project
       with PCA, build weighted KNN/MKNN graphs and a t-SNE layout.
    3. On the input (all cells): learn the manifold, initialised from each
       cell's cluster position plus Gaussian noise, and store the result.
    4. Compute auto-annotation and auto-auto-annotation on the aggregate.

    The input file is opened with a context manager so it is closed when the
    task finishes or fails (it was previously never closed).
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        logging.info("Aggregating loom file")
        # Column attribute -> aggregation method passed to cg.Aggregator.
        # (A duplicated "LeafOrder" entry was removed; the mapping is unchanged.)
        spec = {
            "Age": "tally",
            "Clusters": "first",
            "Class": "mode",
            "_Total": "mean",
            "Sex": "tally",
            "Tissue": "tally",
            "SampleID": "tally",
            "TissuePool": "first",
            "Outliers": "mean",
            "Bucket": "mode",
            "Region": "first",
            "OriginalClusters": "first",
            "LeafOrder": "first",
            "Probable_location": "first",
            "Developmental_compartment": "first",
            "Description": "first",
            "Location_based_on": "first",
            "Neurotransmitter": "first",
            "Comment": "first",
            "ClusterName": "first",
            "TaxonomyRank1": "first",
            "TaxonomyRank2": "first",
            "TaxonomyRank3": "first",
            "TaxonomyRank4": "first",
            "TaxonomySymbol": "first"
        }
        with loompy.connect(self.input().fn) as ds:
            cg.Aggregator(f=[0.2, 0.05]).aggregate(ds, out_file, agg_spec=spec)
            with loompy.connect(out_file) as dsagg:
                logging.info("Finding non-neuronal, housekeeping, and troublemaking genes")
                (nng, blocked) = _gene_selection_L5(dsagg)

                logging.info("Manifold learning on the aggregate file")
                normalizer = cg.Normalizer(False)
                normalizer.fit(dsagg)
                pca = cg.PCAProjection(np.arange(dsagg.shape[1] * 10), max_n_components=50)
                pca.fit(dsagg, normalizer)
                transformed = pca.transform(dsagg, normalizer)
                k = 40
                bnn = cg.BalancedKNN(k=k, maxl=2 * k)
                bnn.fit(transformed)
                # Drop the first neighbour column before building the graph.
                knn = bnn.kneighbors(mode='connectivity')[1][:, 1:]
                n_cells = knn.shape[0]
                a = np.tile(np.arange(n_cells), k)
                b = np.reshape(knn.T, (n_cells * k, ))
                # Edge weight decays with neighbour rank: 1 / rank^1.8.
                w = np.repeat(1 / np.power(np.arange(1, k + 1), 1.8), n_cells)
                knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
                threshold = w > 0.025
                mknn = sparse.coo_matrix(
                    (w[threshold], (a[threshold], b[threshold])),
                    shape=(n_cells, n_cells))
                # Keep only mutual edges (element-wise minimum with the transpose).
                mknn = mknn.minimum(mknn.transpose()).tocoo()
                tsne = cg.TSNE(perplexity=5).layout(transformed)
                dsagg.col_graphs.KNN = knn
                dsagg.col_graphs.MKNN = mknn
                dsagg.ca._X = tsne[:, 0]
                dsagg.ca._Y = tsne[:, 1]

                logging.info("Manifold learning on all cells")
                # Start every cell at its cluster's position plus unit Gaussian noise.
                init = np.zeros((ds.shape[1], 2))
                for lbl in np.unique(ds.ca.Clusters):
                    members = ds.ca.Clusters == lbl
                    init[members, :] = tsne[lbl, :] + np.random.normal(
                        size=(members.sum(), 2))
                ml = cg.ManifoldLearning2(gtsne=True, alpha=1, max_iter=3000)
                (knn, mknn, tsne) = ml.fit(ds, initial_pos=init, nng=nng,
                                           blocked_genes=blocked)
                ds.col_graphs.KNN = knn
                ds.col_graphs.MKNN = mknn
                ds.ca._X = tsne[:, 0]
                ds.ca._Y = tsne[:, 1]

                logging.info("Computing auto-annotation")
                # NOTE(review): relative annotation root, unlike sibling tasks
                # that use am.paths().autoannotation — confirm this is intended.
                aa = cg.AutoAnnotator(root="../auto-annotation/Adolescent/")
                aa.annotate_loom(dsagg)
                aa.save_in_loom(dsagg)

                logging.info("Computing auto-auto-annotation")
                n_clusters = dsagg.shape[1]
                (selected, selectivity, specificity, robustness) = cg.AutoAutoAnnotator(
                    n_genes=6).fit(dsagg)
                dsagg.set_attr("MarkerGenes", np.array([
                    " ".join(ds.ra.Gene[selected[:, ix]]) for ix in range(n_clusters)
                ]), axis=1)
                # Controls how the score vectors are rendered by str() below.
                np.set_printoptions(precision=1, suppress=True)
                for attr_name, scores in (
                    ("MarkerSelectivity", selectivity),
                    ("MarkerSpecificity", specificity),
                    ("MarkerRobustness", robustness),
                ):
                    dsagg.set_attr(attr_name, np.array([
                        str(scores[:, ix]) for ix in range(n_clusters)
                    ]), axis=1)
def run(self) -> None:
    """Export a subsample of oligodendrocyte-lineage cell types and plot them.

    Inside the task's temporary output directory this:

    1. Selects cells whose ``ClusterName`` is one of five fixed cell types,
       randomly keeping at most 820 cells per type, and writes them to
       ``F_Oligos.loom``.
    2. Fits ``cg.ManifoldLearning2`` on the exported file and stores the
       KNN/MKNN graphs and the layout in it.
    3. Saves ``Fig_Oligos_Types.png`` (cells colored by cluster name, with
       the MKNN edges drawn underneath) and ``Fig_Oligos_Cdk1.png`` (cells
       colored by whether Cdk1 has a non-zero value).
    """
    logging = cg.logging(self, True)
    with self.output().temporary_path() as out_dir:
        logging.info("Exporting oligo cell types")
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        ngfile = os.path.join(out_dir, "F_Oligos.loom")

        # The input connection is only needed for the export, so it is closed
        # before the exported file is opened.
        with loompy.connect(self.input().fn) as ds:
            celltypes = ["COP1", "COP2", "NFOL2", "NFOL1", "OPC"]
            cluster_names = ds.ca.ClusterName
            selected = np.array([], dtype='int')
            for ct in celltypes:
                print(ct)
                cells = np.where(cluster_names == ct)[0]
                # Cap each cell type at 820 randomly chosen cells
                if cells.shape[0] > 820:
                    cells = np.random.choice(cells, size=820, replace=False)
                selected = np.union1d(selected, cells)
            for (_, _, view) in ds.scan(items=selected, axis=1):
                loompy.create_append(ngfile, view.layers, view.ra, view.ca)

        with loompy.connect(ngfile) as ds_oligos:
            logging.info("Learning the manifold")
            ml = cg.ManifoldLearning2(gtsne=False, alpha=1)
            (knn, mknn, tsne) = ml.fit(ds_oligos)
            ds_oligos.col_graphs.KNN = knn
            ds_oligos.col_graphs.MKNN = mknn
            ds_oligos.ca._X = tsne[:, 0]
            ds_oligos.ca._Y = tsne[:, 1]

            # Figure 1: cells colored by cluster name, MKNN edges underneath
            fig = plt.figure(figsize=(3, 3))
            ax = fig.add_axes([0, 0, 1, 1])
            lc = LineCollection(zip(tsne[mknn.row], tsne[mknn.col]),
                                linewidths=0.25,
                                zorder=0,
                                color='grey',
                                alpha=0.1)
            ax.add_collection(lc)
            ax.axis('off')
            exported_names = ds_oligos.ca.ClusterName
            types = np.unique(exported_names)
            colors = cg.colorize(types)
            for ix, ct in enumerate(types):
                cells = (exported_names == ct)
                plt.scatter(x=ds_oligos.ca._X[cells],
                            y=ds_oligos.ca._Y[cells],
                            s=40,
                            c=colors[ix, :],
                            marker='.',
                            label=ct,
                            alpha=0.5,
                            lw=0)
            lgnd = ax.legend(fontsize=10,
                             labelspacing=0.2,
                             loc="upper left",
                             bbox_to_anchor=(1, 1),
                             frameon=False)
            # Make the legend markers large and opaque regardless of the
            # small, translucent markers used in the plot itself.
            for handle in lgnd.legendHandles:
                handle.set_sizes([250])
                handle.set_alpha(1)
            plt.savefig(os.path.join(out_dir, "Fig_Oligos_Types.png"),
                        dpi=600,
                        transparent=True,
                        bbox_extra_artists=(lgnd, ),
                        bbox_inches='tight')
            plt.close()

            # Figure 2: cells colored by Cdk1 (index 0 of cg.colors75 when
            # the value is zero, index 1 otherwise). No legend is drawn, so
            # no label is attached to the scatter.
            fig = plt.figure(figsize=(3, 3))
            ax = fig.add_axes([0, 0, 1, 1])
            ax.axis('off')
            has_cdk1 = (ds_oligos[ds_oligos.ra.Gene == "Cdk1", :][0] !=
                        0).astype('int')
            plt.scatter(x=ds_oligos.ca._X,
                        y=ds_oligos.ca._Y,
                        s=40,
                        c=cg.colors75[has_cdk1],
                        marker='.',
                        alpha=0.5,
                        lw=0)
            plt.savefig(os.path.join(out_dir, "Fig_Oligos_Cdk1.png"),
                        dpi=600,
                        transparent=True,
                        bbox_inches='tight')
            # Release the figure, as is done for the first one.
            plt.close()
def run(self) -> None:
    """Pool cells of ``self.major_class`` from all inputs, then lay out and cluster them.

    For every input loom file:

    * cells labelled as noise (-1) by DBSCAN on the stored ``_X``/``_Y``
      layout are dropped;
    * a cell is kept only if its ``Class`` equals ``self.major_class`` and
      more than 20% of its ``KNN`` graph neighbors have that class too;
    * for the ``"Oligos"`` class, additional gene-based filters are applied
      and at most 5000 cells are kept per input file;
    * the surviving cells are appended to the output file.

    The pooled output file then gets a manifold from ``cg.ManifoldLearning2``
    (graphs and layout stored in the file) and cluster labels from
    ``cg.PolishedLouvain`` (stored as ``Clusters`` and ``Outliers``).
    """
    logging = cg.logging(self)
    with self.output().temporary_path() as out_file:
        for clustered in self.input():
            with loompy.connect(clustered.fn, "r") as ds:
                logging.info("Split/pool from " + clustered.fn)

                logging.info("Masking outliers")
                min_pts = 10
                eps_pct = 80
                tsne_pos = np.vstack(
                    (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()

                # DBSCAN to find outliers; epsilon is the eps_pct-th
                # percentile of each cell's largest distance within its
                # min_pts-neighbor graph
                nn = NearestNeighbors(n_neighbors=min_pts,
                                      algorithm="ball_tree",
                                      n_jobs=4)
                nn.fit(tsne_pos)
                knn = nn.kneighbors_graph(mode='distance')
                k_radius = knn.max(axis=1).toarray()
                epsilon = np.percentile(k_radius, eps_pct)
                clusterer = DBSCAN(eps=epsilon, min_samples=min_pts)
                labels = clusterer.fit_predict(tsne_pos)

                # Mask out cells that don't match the class of their local neighbors
                logging.info("Masking cells in bad neighborhoods")
                n_cells = ds.shape[1]
                graph = ds.col_graphs.KNN
                is_major = ds.ca.Class == self.major_class
                # One pass over the edge list: per source cell, count all
                # neighbors and those neighbors that are in the major class.
                degree = np.bincount(graph.row, minlength=n_cells)
                major_neighbors = np.bincount(
                    graph.row,
                    weights=is_major[graph.col].astype(float),
                    minlength=n_cells)
                # Cells without neighbors get fraction 0 and are dropped.
                fraction = major_neighbors / np.maximum(degree, 1)
                cells = np.where((labels != -1) & is_major &
                                 (fraction > 0.2))[0]

                if self.major_class == "Oligos":
                    # Special selection of cells for the oligo class, to balance between tissues
                    genes = ds.ra.Gene
                    enough_genes = ds.map(
                        (np.count_nonzero, ), axis=1)[0] > 1000
                    has_pdgfra = ds[genes == "Pdgfra", :][0] > 0
                    has_meg3 = ds[genes == "Meg3", :][0] > 0
                    # A cell counts as a doublet if any of these genes has a
                    # non-zero value in it
                    is_doublet = np.zeros(ds.shape[1], dtype='bool')
                    for g in [
                            'Stmn2', 'Aqp4', 'Gja1', 'C1qc', 'Aif1', 'Cldn5',
                            'Fn1', 'Hbb-bt', 'Hbb-bh1', 'Hbb-bh2', 'Hbb-y',
                            'Hbb-bs', 'Hba-a1', 'Hba-a2', 'Hba-x'
                    ]:
                        is_doublet = np.logical_or(
                            is_doublet, ds[genes == g, :][0] > 0)
                    ok_cells = enough_genes & (~is_doublet) & (has_pdgfra |
                                                               ~has_meg3)
                    cells = np.intersect1d(cells, np.where(ok_cells)[0])
                    # Cap the contribution of each input file at 5000 cells
                    if cells.shape[0] > 5000:
                        cells = np.random.choice(cells, 5000, False)

                for (_, _, view) in ds.scan(items=cells,
                                            axis=1,
                                            key="Accession"):
                    loompy.create_append(out_file, view.layers, view.ra,
                                         view.ca)

        with loompy.connect(out_file) as dsout:
            logging.info("Learning the manifold")
            if self.major_class == "Oligos":
                # For oligos, gtsne is left at the ManifoldLearning2 default
                ml = cg.ManifoldLearning2(n_genes=self.n_genes,
                                          alpha=self.alpha)
            else:
                ml = cg.ManifoldLearning2(n_genes=self.n_genes,
                                          gtsne=self.gtsne,
                                          alpha=self.alpha)
            (knn, mknn, tsne) = ml.fit(dsout)
            dsout.col_graphs.KNN = knn
            dsout.col_graphs.MKNN = mknn
            dsout.ca._X = tsne[:, 0]
            dsout.ca._Y = tsne[:, 1]

            logging.info("Clustering on the manifold")
            pl = cg.PolishedLouvain()
            labels = pl.fit_predict(dsout)
            # Stored cluster ids are shifted by one; cells labelled -1 are
            # additionally flagged in Outliers
            dsout.ca.Clusters = labels + 1
            dsout.ca.Outliers = (labels == -1).astype('int')
            logging.info(f"Found {labels.max() + 1} clusters")