Example #1
	def fit(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> None:
		if cells is None:
			cells = np.fromiter(range(ds.shape[1]), dtype='int')
		n_cells = cells.shape[0]
		n_genes = self.genes.shape[0]

		# Support out-of-order datasets
		if "Accession" in ds.row_attrs:
			self.accessions = ds.row_attrs["Accession"]

		self.pca = IncrementalPCA(n_components=self.n_components)
		if self.layer is not None:
			# NOTE: the try/except below works around a bug with the layer attribute of unpickled objects
			try:
				for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
					vals = normalizer.transform(view.layers[self.layer][:, :], selection)
					if self.nng is not None:
						vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
					self.pca.partial_fit(vals[self.genes, :].transpose())		# PCA on the selected genes
			except AttributeError:
				self.layer = None
		if self.layer is None:
			for (ix, selection, view) in ds.scan(items=cells, axis=1):
				vals = normalizer.transform(view[:, :], selection)
				if self.nng is not None:
					vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
				self.pca.partial_fit(vals[self.genes, :].transpose())		# PCA on the selected genes
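For context, a minimal usage sketch of the fit above, assuming it belongs to the cg.PCAProjection class used in Examples #10 and #16 (the filename and feature-selection size are illustrative):

import numpy as np
import loompy
import cytograph as cg  # assumed import path for the `cg` alias used throughout

with loompy.connect("mydata.loom") as ds:  # illustrative filename
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)
    genes = cg.FeatureSelection(1000).fit(ds, mu=normalizer.mu, sd=normalizer.sd)
    pca = cg.PCAProjection(genes, max_n_components=50)
    pca.fit(ds, normalizer)                      # the method shown above
    transformed = pca.transform(ds, normalizer)  # see Example #8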
Example #2
    def fit_transform(self, ds: loompy.LoomConnection) -> None:
        # Poisson pooling
        self.fit(ds)
        knn = self.knn.astype("bool")

        logging.debug(f"Poisson pooling")
        ds["pooled"] = 'int32'
        if self.compute_velocity and "spliced" in ds.layers:
            ds["spliced_pooled"] = 'int32'
            ds["unspliced_pooled"] = 'int32'
            for (_, indexes, view) in ds.scan(axis=0,
                                              layers=["spliced", "unspliced"],
                                              what=["layers"]):
                ds["spliced_pooled"][
                    indexes.min():indexes.max() +
                    1, :] = view.layers["spliced"][:, :] @ knn.T
                ds["unspliced_pooled"][
                    indexes.min():indexes.max() +
                    1, :] = view.layers["unspliced"][:, :] @ knn.T
                ds["pooled"][indexes.min():indexes.max() +
                             1, :] = ds["spliced_pooled"][indexes.min(
                             ):indexes.max() + 1, :] + ds["unspliced_pooled"][
                                 indexes.min():indexes.max() + 1, :]
        else:
            for (_, indexes, view) in ds.scan(axis=0,
                                              layers=[""],
                                              what=["layers"]):
                ds["pooled"][indexes.min():indexes.max() +
                             1, :] = view[:, :] @ knn.T
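The pooling above is a neighborhood sum: each output column aggregates the raw counts over that cell's KNN row. A toy illustration (not from the source; the 0/1 adjacency stands in for the boolean KNN):

import numpy as np
from scipy import sparse

counts = np.array([[1, 0, 2],
                   [0, 3, 1]])                  # genes x cells
knn = sparse.csr_matrix(np.array([[1, 1, 0],
                                  [1, 1, 1],
                                  [0, 1, 1]]))  # cells x cells adjacency
pooled = counts @ knn.T                         # column j sums counts over cell j's neighbors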
Example #3
	def fit(self, ds: loompy.LoomConnection, mu: np.ndarray = None, sd: np.ndarray = None, totals: np.ndarray = None) -> None:
		self.sd = sd
		self.mu = mu
		self.totals = totals

		if mu is None or sd is None:
			(self.sd, self.mu) = ds.map([np.std, np.mean], axis=0)
		if totals is None:
			self.totals = ds.map([np.sum], chunksize=100, axis=1)[0]
Example #4
    def _fit(self, ds: loompy.LoomConnection,
             labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        logging.info("Computing enrichment statistic")
        n_labels = len(np.unique(labels))
        n_genes, n_cells = ds.shape

        # Number of cells per cluster
        sizes = np.bincount(labels, minlength=n_labels)
        # Number of nonzero values per cluster
        nnz = ds.aggregate(None, None, labels, np.count_nonzero, None)
        # Mean value per cluster
        means = ds.aggregate(None, None, labels, "mean", None)
        # Non-zeros and means over all cells
        (nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean],
                                              axis=0)
        # Scale by number of cells
        f_nnz = nnz / sizes
        f_nnz_overall = nnz_overall / n_cells

        # Means and fraction non-zero values in other clusters (per cluster)
        means_other = ((means_overall * n_cells)[None].T -
                       (means * sizes)) / (n_cells - sizes)
        f_nnz_other = ((f_nnz_overall * n_cells)[None].T -
                       (f_nnz * sizes)) / (n_cells - sizes)

        # enrichment = (f_nnz + 0.1) / (f_nnz_overall[None].T + 0.1) * (means + 0.01) / (means_overall[None].T + 0.01)
        enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (
            means_other + 0.01)

        # Select best markers
        if self.valid_genes is None:
            logging.info("Identifying valid genes")
            nnz = ds.map([np.count_nonzero], axis=0)[0]
            self.valid_genes = np.logical_and(nnz > 10,
                                              nnz < ds.shape[1] * 0.6)

        if self.mask is None:
            excluded = set(np.where(~self.valid_genes)[0])
        else:
            excluded = set(np.where(((~self.valid_genes) | self.mask))[0])

        included = np.zeros(n_genes, dtype=bool)
        for ix in range(n_labels):
            enriched = np.argsort(enrichment[:, ix])[::-1]
            n = 0
            count = 0
            while count < self.n_markers_per_cluster:
                if enriched[n] in excluded:
                    n += 1
                    continue
                included[enriched[n]] = True
                excluded.add(enriched[n])
                n += 1
                count += 1
        return (included, enrichment, means)
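The leave-one-out statistics above score each cluster against all other cells rather than against the global mean. A worked toy check of the means_other formula (one gene, two clusters; values illustrative):

import numpy as np

means = np.array([[2.0, 0.5]])  # one gene x two clusters
sizes = np.array([10, 30])      # cells per cluster
n_cells = sizes.sum()
means_overall = (means * sizes).sum(axis=1) / n_cells  # 0.875
means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
# means_other == [[0.5, 2.0]]: each cluster is compared to the mean of the *other* cells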
Example #5
def mito_genes_ratio(ds: loompy.LoomConnection) -> None:
    mito_genes = np.where(ds.ra.Chromosome == "MT")[0]
    exp_mito = ds[mito_genes, :]
    sum_mito = exp_mito.sum(axis=0)
    sum_all = ds.ca.TotalUMI
    mito_ratio = np.divide(sum_mito, sum_all)
    ds.ca["MT_ratio"] = mito_ratio
Example #6
    def fit(self, ds: loompy.LoomConnection) -> None:
        self.sd = np.zeros(ds.shape[0])
        self.mu = np.zeros(ds.shape[0])
        self.totals = np.zeros(ds.shape[1])

        for _, selection, view in ds.scan(axis=0):
            vals = view[self.layer][:, :].astype("float")
            self.totals += np.sum(vals, axis=0)
        self.level = np.median(self.totals)

        for _, selection, view in ds.scan(axis=0):
            vals = view[self.layer][:, :].astype("float")
            # Rescale to the median total UMI count, plus 1 (to avoid log of zero), then log transform
            vals = np.log2(div0(vals, self.totals) * self.level + 1)
            self.mu[selection] = np.mean(vals, axis=1)
            self.sd[selection] = np.std(vals, axis=1)
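The div0 helper is not shown in these snippets; a common definition (element-wise division that yields 0 wherever the denominator is 0) would be:

import numpy as np

def div0(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """a / b, returning 0 where b == 0 instead of inf or NaN."""
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(a, b)
    c[~np.isfinite(c)] = 0  # replaces inf, -inf and NaN
    return c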
Example #7
def expression_patterns(ds: loompy.LoomConnection, labels: np.ndarray, pep: float, f: float, cells: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
	"""
	Derive enrichment and trinary scores for all genes

	Args:
		ds (LoomConnection):	Dataset
		labels (numpy array):	Cluster labels (one per cell)
		pep (float):			Desired posterior error probability
		f (float):				Fraction required for a gene to be considered 'expressed'
		cells (numpy array):	Indices of cells to include

	Returns:
		scores1 (numpy 2d array):		Array of (n_genes, n_labels)
		scores2 (numpy 2d array):		Array of (n_genes, n_labels)
		trinary_prob (numpy 2d array):	Array of (n_genes, n_labels)
		trinary_pat (numpy 2d array):	Array of (n_genes, n_labels)

	Remarks:
		If the cells argument is provided, the labels should include only those cells. That is,
		labels.shape[0] == cells.shape[0].

		Amit says,
		regarding marker genes.
		i usually rank the genes by some kind of enrichment score.
		score1 = mean of gene within the cluster / mean of gene in all cells
		score2 = fraction of positive cells within cluster

		enrichment score = score1 * score2^power   (where power == 0.5 or 1) i usually use 1 for 10x data
	"""

	n_labels = np.max(labels) + 1

	scores1 = np.empty((ds.shape[0], n_labels))
	scores2 = np.empty((ds.shape[0], n_labels))
	trinary_pat = np.empty((ds.shape[0], n_labels))
	trinary_prob = np.empty((ds.shape[0], n_labels))

	j = 0
	for (ix, selection, vals) in ds.batch_scan(cells=cells, genes=None, axis=0):
		# vals = normalizer.normalize(vals, selection)
		for j, row in enumerate(selection):
			data = vals[j, :]
			mu0 = np.mean(data)
			f0 = np.count_nonzero(data)
			score1 = np.zeros(n_labels)
			score2 = np.zeros(n_labels)
			for lbl in range(n_labels):
				if np.sum(labels == lbl) == 0:
					continue
				sel = data[np.where(labels == lbl)[0]]
				if mu0 == 0 or f0 == 0:
					score1[lbl] = 0
					score2[lbl] = 0
				else:
					score1[lbl] = np.mean(sel) / mu0
					score2[lbl] = np.count_nonzero(sel) / f0
			scores1[row, :] = score1
			scores2[row, :] = score2
			trinary_prob[row, :], trinary_pat[row, :] = betabinomial_trinarize_array(data, labels, pep, f)
	return (scores1, scores2, trinary_prob, trinary_pat)
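The combined ranking described in the remarks can be computed directly from the returned matrices; a sketch (the power value and cluster index are illustrative):

power = 1.0  # the remarks suggest 1 for 10x data, 0.5 otherwise
enrichment = scores1 * scores2 ** power
top20_cluster0 = np.argsort(enrichment[:, 0])[::-1][:20]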
Example #8
	def transform(self, ds: loompy.LoomConnection, normalizer: cg.Normalizer, cells: np.ndarray = None) -> np.ndarray:
		if cells is None:
			cells = np.fromiter(range(ds.shape[1]), dtype='int')
		n_cells = cells.shape[0]

		# Support out-of-order datasets
		if self.accessions is not None:
			# This is magic sauce for making the order of one list be like another
			ordering = np.where(ds.row_attrs["Accession"][None, :] == self.accessions[:, None])[1]

		transformed = np.zeros((cells.shape[0], self.pca.n_components_))
		j = 0

		if self.layer is not None:
			# NOTE: the try/except below works around a bug with the layer attribute of unpickled objects
			try:
				for (ix, selection, view) in ds.scan(items=cells, axis=1, layers=[self.layer]):
					vals = normalizer.transform(view.layers[self.layer][:, :], selection)
					if self.nng is not None:
						vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
					n_cells_in_batch = selection.shape[0]
					transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
					j += n_cells_in_batch
			except AttributeError:
				self.layer = None
		if self.layer is None:
			for (ix, selection, view) in ds.scan(items=cells, axis=1):
				vals = normalizer.transform(view[:, :], selection)
				if self.nng is not None:
					vals[np.where(self.nng)[0][:, None], np.where(ds.TaxonomyRank1 == "Neurons")[0]] = 0
				n_cells_in_batch = selection.shape[0]
				transformed[j:j + n_cells_in_batch, :] = self.pca.transform(vals[self.genes, :].transpose())
				j += n_cells_in_batch

		# Must select significant components only once, and reuse for future transformations
		if self.sigs is None:
			pvalue_KS = np.zeros(transformed.shape[1])  # pvalue of each component
			for i in range(1, transformed.shape[1]):
				(_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1], transformed[:, i])
			self.sigs = np.where(pvalue_KS < 0.1)[0]
			if len(self.sigs) == 0:
				self.sigs = (0, 1)

		transformed = transformed[:, self.sigs]

		return transformed
Example #9
def unspliced_ratio(ds: loompy.LoomConnection,
                    graphs: bool = True,
                    sample_name: object = "tmp") -> None:
    u = ds.layers["unspliced"][:]
    sum_all = ds.ca.TotalUMI
    sum_u = u.sum(axis=0)
    unspliced_ratio = np.divide(sum_u, sum_all)
    ds.ca["unspliced_ratio"] = unspliced_ratio
Example #10
	def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
		"""
		Fit a classifier and use it to determine cluster predictive power

		Args:
			ds		Dataset
			plot	Filename for optional plot

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""
		logging.info("Feature selection")
		nnz = ds.map([np.count_nonzero], axis=0)[0]
		valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
		ds.ra._Valid = valid_genes

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		logging.info("Feature selection")
		(_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
		genes = np.zeros_like(ds.ra.Gene, dtype=bool)
		for ix in range(enrichment.shape[1]):
			genes[np.argsort(-enrichment[:, ix])[:25]] = True

		logging.info("PCA projection")
		pca = cg.PCAProjection(genes, max_n_components=50)
		transformed = pca.fit_transform(ds, normalizer)

		le = LabelEncoder().fit(ds.ca.ClusterName)
		self.le = le
		labels = le.transform(ds.ca.ClusterName)

		train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
		classifier = RandomForestClassifier(max_depth=30)
		classifier.fit(train_X, train_Y)
		self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
		self.proba = classifier.predict_proba(transformed)

		if plot:
			agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
			plt.imshow(agg, cmap="viridis")
			plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
			plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
			plt.xlabel("Predicted cell type")
			plt.ylabel("Observed cell type")
			plt.title("Predictive power of cluster identities")
			cbar = plt.colorbar()
			cbar.set_label('Average classification probability', rotation=90)
			plt.savefig(plot, bbox_inches="tight")

		return self.proba
Example #11
	def fit_loom(self, ds: loompy.LoomConnection, *, tolayer: str = "enrichment", knn: Union[str, sparse.csr_matrix] = "KNN") -> None:
		if tolayer not in ds.layers:
			ds[tolayer] = "float32"
		if isinstance(knn, str):
			knn_matrix = ds.col_graphs[knn].tocsr()
		else:
			knn_matrix = knn
		k = knn_matrix.count_nonzero() / knn_matrix.shape[0]
		with tqdm(total=ds.shape[0], desc="Neighborhood enrichment") as pbar:
			for ix, selection, view in ds.scan(axis=0, what=["layers"]):
				for j in range(view.shape[0]):
					ds[tolayer][j + ix, :] = self.fit(view[j, :], knn_matrix, k)
				pbar.update(view.shape[0])
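k above is the average neighbor count per cell; a toy check (not from the source):

from scipy import sparse

knn_matrix = sparse.csr_matrix([[0, 1, 1],
                                [1, 0, 1],
                                [1, 1, 0]])
k = knn_matrix.count_nonzero() / knn_matrix.shape[0]  # 2.0 neighbors per cell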
Example #12
    def transform(self,
                  ds: loompy.LoomConnection,
                  normalizer: Normalizer,
                  cells: np.ndarray = None) -> np.ndarray:
        if cells is None:
            cells = np.arange(ds.shape[1])

        transformed = np.zeros((cells.shape[0], self.pca.n_components_))
        j = 0

        # Support out-of-order datasets
        key = None
        if "Accession" in ds.row_attrs:
            key = "Accession"

        layer = self.layer if self.layer is not None else ""
        for (_, selection, view) in ds.scan(items=cells,
                                            axis=1,
                                            layers=[layer],
                                            key=key):
            vals = normalizer.transform(view.layers[layer][:, :], selection)
            n_cells_in_batch = selection.shape[0]
            transformed[j:j + n_cells_in_batch, :] = self.pca.transform(
                vals[self.genes, :].transpose())
            j += n_cells_in_batch

        if self.test_significance:
            # Must select significant components only once, and reuse for future transformations
            if self.sigs is None:
                pvalue_KS = np.zeros(
                    transformed.shape[1])  # pvalue of each component
                for i in range(1, transformed.shape[1]):
                    (_, pvalue_KS[i]) = ks_2samp(transformed[:, i - 1],
                                                 transformed[:, i])
                self.sigs = np.where(pvalue_KS < 0.1)[0]
                if len(self.sigs) == 0:
                    self.sigs = (0, 1)

            transformed = transformed[:, self.sigs]

        if self.batch_keys is not None and len(self.batch_keys) > 0:
            keys_df = pd.DataFrame.from_dict(
                {k: ds.ca[k]
                 for k in self.batch_keys})
            transformed = harmonize(transformed,
                                    keys_df,
                                    batch_key=self.batch_keys)
        return transformed
Example #13
    def fit(self, ds: loompy.LoomConnection) -> np.ndarray:
        cells = np.where(ds.col_attrs["Clusters"] >= 0)[0]
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = np.max(labels) + 1
        logging.info("n_labels %d", n_labels)
        self.trinary_prob = np.empty((ds.shape[0], n_labels))
        self.genes = ds.ra.Gene

        for (ix, selection, view) in ds.scan(axis=0, what=["layers"]):
            vals = view[:, cells]
            for j, row in enumerate(selection):
                data = np.round(vals[j, :])
                self.trinary_prob[row, :] = self._betabinomial_trinarize_array(
                    data, labels, self.f, n_labels)

        return self.trinary_prob
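Assuming this fit belongs to cg.Trinarizer, as the call in Example #20 suggests, it is invoked roughly as follows (the f cutoff is illustrative):

trinaries = cg.Trinarizer(f=0.2).fit(ds)  # posterior probabilities, shape (n_genes, n_labels)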
Example #14
    def fit(self,
            ds: loompy.LoomConnection,
            cells: np.ndarray = None,
            mu: np.ndarray = None,
            sd: np.ndarray = None,
            mask: np.ndarray = None) -> np.ndarray:
        """
		Fits a noise model (CV vs mean)

		Args:
			ds (LoomConnection):	Dataset
			cells (ndarray):		Cells to include when computing mean and CV (or None for all cells)
			mu, sd (ndarray):		Precomputed means and standard deviations (optional)
			mask (ndarray):			Genes to exclude from selection (optional)

		Returns:
			ndarray of selected genes (list of ints)
		"""
        if mu is None or sd is None:
            (mu, sd) = ds.map((np.mean, np.std), axis=0, selection=cells)

        if "_Valid" in ds.ra:
            valid = ds.ra._Valid == 1
        else:
            valid = np.ones(ds.shape[0], dtype='bool')
        if mask is not None:
            valid = np.logical_and(valid, np.logical_not(mask))
        valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Xist")
        valid = np.logical_and(valid, ds.row_attrs["Gene"] != "Tsix")
        valid = valid.astype('int')

        ok = np.logical_and(mu > 0, sd > 0)
        cv = sd[ok] / mu[ok]
        log2_m = np.log2(mu[ok])
        log2_cv = np.log2(cv)

        svr_gamma = 1000. / len(mu[ok])
        clf = SVR(gamma=svr_gamma)
        clf.fit(log2_m[:, np.newaxis], log2_cv)
        fitted_fun = clf.predict
        # Score is the relative position with respect of the fitted curve
        score = log2_cv - fitted_fun(log2_m[:, np.newaxis])
        score = score * valid[ok]
        self.genes = np.where(ok)[0][np.argsort(score)][-self.n_genes:]

        return self.genes
Example #15
def plot_knn(ds: loompy.LoomConnection, out_file: str) -> None:
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix(
        (w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    xy = np.vstack(
        (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    ax = fig.add_subplot(111)

    nx.draw_networkx_edges(g, pos=xy, alpha=0.25, width=0.2, edge_color='gray')
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
Example #16
    def fit(self, ds: loompy.LoomConnection) -> None:
        # Validating genes
        logging.info("Marking invalid genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        with open(os.path.join(self.classified_dir, "genes.txt"), "w") as f:
            for ix in range(valid_genes.shape[0]):
                f.write(ds.Accession[ix])
                f.write("\t")
                f.write(str(valid_genes[ix]))
                f.write("\n")

        logging.info("Normalization")
        normalizer = cg.Normalizer(True)
        normalizer.fit(ds)
        self.mu = normalizer.mu
        self.sd = normalizer.sd

        logging.info("Feature selection")
        genes = cg.FeatureSelection(2000).fit(ds)

        logging.info("PCA projection")
        self.pca = cg.PCAProjection(genes, max_n_components=50)
        transformed = self.pca.fit_transform(ds, normalizer)

        self.classes = ds.col_attrs["SubclassAssigned"]
        self.le = LabelEncoder().fit(self.classes)
        self.labels = self.le.transform(self.classes)

        train_X, test_X, train_Y, test_Y = train_test_split(transformed,
                                                            self.labels,
                                                            test_size=0.2,
                                                            random_state=0)
        self.classifier = SVC(probability=True)
        self.classifier.fit(train_X, train_Y)
        with open(os.path.join(self.classified_dir, "performance.txt"),
                  "w") as f:
            f.write(
                classification_report(test_Y,
                                      self.classifier.predict(test_X),
                                      target_names=self.le.classes_))
Example #17
	def connect(self, project: str, filename: str, mode: str="r+", timeout: float=10) -> LoomConnection:
		"""
		Try to connect to a local loom file. Returns None if the file does not exist or is already connected.

		Uses a semaphore to ensure there is never more than one connection
		open to a loom file (as long as this is the only object that connects to the loom file)

		Remember to call `release(project, filename)` after closing the connection!

		Args:
			- project (string): 		Name of the project (e.g. "Midbrain")
			- filename (string): 		Filename of the loom file (e.g. "Midbrain_20160701.loom")

		Returns:
			A loom file connection, or None if file does not exist or was already connected.
		"""
		if self.acquire(project, filename, timeout):
			absolute_path = self.list.absolute_file_path(project, filename)
			return LoomConnection(absolute_path, mode)
		return None
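A usage sketch of the connect/release pairing described in the docstring (the manager variable is illustrative; project and filename echo the docstring):

dm = ...  # whatever object exposes this connect() plus release()
ds = dm.connect("Midbrain", "Midbrain_20160701.loom")
if ds is not None:
    try:
        print(ds.shape)
    finally:
        ds.close()
        dm.release("Midbrain", "Midbrain_20160701.loom")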
Example #18
    def fit(self,
            ds: loompy.LoomConnection,
            normalizer: Normalizer,
            cells: np.ndarray = None) -> None:
        if cells is None:
            cells = np.fromiter(range(ds.shape[1]), dtype='int')

        # Support out-of-order datasets
        key = None
        if "Accession" in ds.row_attrs:
            key = "Accession"

        self.pca = IncrementalPCA(n_components=self.n_components)
        layer = self.layer if self.layer is not None else ""
        for (_, selection, view) in ds.scan(items=cells,
                                            axis=1,
                                            layers=[layer],
                                            key=key):
            if len(selection) < self.n_components:
                continue
            vals = normalizer.transform(view.layers[layer][:, :], selection)
            self.pca.partial_fit(
                vals[self.genes, :].transpose())  # PCA on the selected genes
Example #19
	def tile(self, project: str, filename: str, truncate: bool = False) -> None:
		absolute_path = self.list.absolute_file_path(project, filename)
		ds = None
		if absolute_path != "":
			try:
				lock = self.connections.dataset_locks.get(absolute_path)
				if lock is not None and lock.acquire(blocking=True, timeout=10):
					ds = LoomConnection(absolute_path, 'r')
					tiles = LoomTiles(ds)
					tiles.prepare_heatmap(truncate)
					ds.close()
					lock.release()
			except TimeoutError:
				# May happen when cancelled by the environment (for example, on a server).
				# If so, and the lock was acquired, release the dataset and lock
				if ds is not None:
					ds.close()
					lock.release()
Example #20
    def aggregate(self,
                  ds: loompy.LoomConnection,
                  out_file: str,
                  agg_spec: Dict[str, str] = None) -> None:
        if agg_spec is None:
            agg_spec = {
                "Age": "tally",
                "Clusters": "first",
                "Class": "mode",
                "_Total": "mean",
                "Sex": "tally",
                "Tissue": "tally",
                "SampleID": "tally",
                "TissuePool": "first",
                "Outliers": "mean"
            }
        cells = ds.col_attrs["Clusters"] >= 0
        labels = ds.col_attrs["Clusters"][cells]
        n_labels = len(set(labels))

        logging.info("Aggregating clusters by mean")
        cg.aggregate_loom(ds, out_file, None, "Clusters", "mean", agg_spec)
        with loompy.connect(out_file) as dsout:
            logging.info("Trinarizing")
            if isinstance(self.f, (list, tuple)):
                for ix, f in enumerate(self.f):
                    trinaries = cg.Trinarizer(f=f).fit(ds)
                    if ix == 0:
                        dsout.layers["trinaries"] = trinaries
                    else:
                        dsout.layers[f"trinaries_{f}"] = trinaries
            else:
                trinaries = cg.Trinarizer(f=self.f).fit(ds)
                dsout.layers["trinaries"] = trinaries

            logging.info("Computing cluster gene enrichment scores")
            (markers, enrichment,
             qvals) = cg.MarkerSelection(self.n_markers).fit(ds)
            dsout.layers["enrichment"] = enrichment
            dsout.layers["enrichment_q"] = qvals

            dsout.ca.NCells = np.bincount(labels, minlength=n_labels)

            # Renumber the clusters
            logging.info(
                "Renumbering clusters by similarity, and permuting columns")
            if "_Selected" in ds.ra:
                genes = (ds.ra._Selected == 1)
            else:
                logging.info("Normalization")
                normalizer = cg.Normalizer(False)
                normalizer.fit(ds)
                logging.info("Selecting up to 1000 genes")
                genes = cg.FeatureSelection(1000).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

            data = np.log(dsout[:, :] + 1)[genes, :].T
            D = pdist(data, 'euclidean')
            Z = hc.linkage(D, 'ward')
            optimal_Z = optimal_leaf_ordering(Z, D)
            ordering = hc.leaves_list(optimal_Z)

            # Permute the aggregated file, and renumber
            dsout.permute(ordering, axis=1)
            dsout.ca.Clusters = np.arange(n_labels)

            # Renumber the original file, and permute
            d = dict(zip(ordering, np.arange(n_labels)))
            new_clusters = np.array(
                [d[x] if x in d else -1 for x in ds.ca.Clusters])
            ds.ca.Clusters = new_clusters
            ds.permute(np.argsort(ds.col_attrs["Clusters"]), axis=1)

            # Reorder the genes, markers first, ordered by enrichment in clusters
            logging.info("Permuting rows")
            mask = np.zeros(ds.shape[0], dtype=bool)
            mask[markers] = True
            # fetch enrichment from the aggregated file, so we get it already permuted on the column axis
            gene_order = np.zeros(ds.shape[0], dtype='int')
            gene_order[mask] = np.argmax(dsout.layer["enrichment"][mask, :],
                                         axis=1)
            gene_order[~mask] = np.argmax(dsout.layer["enrichment"][~mask, :],
                                          axis=1) + dsout.shape[1]
            gene_order = np.argsort(gene_order)
            ds.permute(gene_order, axis=0)
            dsout.permute(gene_order, axis=0)

            data = trinaries[:, ordering][gene_order, :][:self.n_markers * n_labels, :].T
            cluster_scores = []
            for ix in range(n_labels):
                cluster_scores.append(data[ix, ix * self.n_markers:(ix + 1) * self.n_markers].sum())
            dsout.ca.ClusterScore = np.array(cluster_scores)
Example #21
def plot_graph(ds: loompy.LoomConnection,
               out_file: str,
               tags: List[str] = None) -> None:
    logging.info("Loading graph")
    n_cells = ds.shape[1]
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]
    has_edges = False
    if "MKNN" in ds.list_edges(axis=1):
        (a, b, w) = ds.get_edges("MKNN", axis=1)
        has_edges = True
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()
    labels = ds.col_attrs["Clusters"]
    if "Outliers" in ds.col_attrs:
        outliers = ds.col_attrs["Outliers"]
    else:
        outliers = np.zeros(ds.shape[1])
    # Compute a good size for the markers, based on local density
    logging.info("Computing node size")
    min_pts = 50
    eps_pct = 60
    nn = NearestNeighbors(n_neighbors=min_pts, algorithm="ball_tree", n_jobs=4)
    nn.fit(pos)
    knn = nn.kneighbors_graph(mode='distance')
    k_radius = knn.max(axis=1).toarray()
    epsilon = 24 * np.percentile(k_radius, eps_pct)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)

    # Draw edges
    if has_edges:
        logging.info("Drawing edges")
        lc = LineCollection(zip(pos[a], pos[b]),
                            linewidths=0.25,
                            zorder=0,
                            color='grey',
                            alpha=0.1)
        ax.add_collection(lc)

    # Draw nodes
    logging.info("Drawing nodes")
    # Vega20b/Vega20c were renamed tab20b/tab20c in matplotlib 2.2
    colors20 = np.vstack((plt.cm.tab20b(np.linspace(0., 1, 20))[::2],
                          plt.cm.tab20c(np.linspace(0, 1, 20))[1::2]))
    plots = []
    names = []
    for i in range(max(labels) + 1):
        cluster = labels == i
        n_cells = cluster.sum()
        if np.all(outliers[labels == i] == 1):
            edgecolor = colorConverter.to_rgba('red', alpha=.1)
            plots.append(
                plt.scatter(x=pos[outliers == 1, 0],
                            y=pos[outliers == 1, 1],
                            c='grey',
                            marker='.',
                            edgecolors=edgecolor,
                            alpha=0.1,
                            s=epsilon))
            names.append(f"{i}/n={n_cells}  (outliers)")
        else:
            plots.append(
                plt.scatter(x=pos[cluster, 0],
                            y=pos[cluster, 1],
                            c=cg.colors75[np.mod(i, 75)],
                            marker='.',
                            lw=0,
                            s=epsilon,
                            alpha=0.75))
            txt = str(i)
            if "ClusterName" in ds.ca.keys():
                txt = ds.ca.ClusterName[ds.ca.Clusters == i][0]
            if tags is not None:
                names.append(f"{txt}/n={n_cells} " +
                             tags[i].replace("\n", " "))
            else:
                names.append(f"{txt}/n={n_cells}")
    logging.info("Drawing legend")
    plt.legend(plots,
               names,
               scatterpoints=1,
               markerscale=2,
               loc='upper left',
               bbox_to_anchor=(1, 1),
               fancybox=True,
               framealpha=0.5,
               fontsize=10)

    logging.info("Drawing cluster IDs")
    for lbl in range(0, max(labels) + 1):
        txt = str(lbl)
        if "ClusterName" in ds.ca.keys():
            txt = ds.ca.ClusterName[ds.ca.Clusters == lbl][0]
        if np.all(outliers[labels == lbl] == 1):
            continue
        if np.sum(labels == lbl) == 0:
            continue
        (x, y) = np.median(pos[np.where(labels == lbl)[0]], axis=0)
        ax.text(x,
                y,
                txt,
                fontsize=12,
                bbox=dict(facecolor='white', alpha=0.5, ec='none'))
    logging.info("Saving to file")
    fig.savefig(out_file, format="png", dpi=144, bbox_inches='tight')
    plt.close()
Example #22
def plot_classification(ds: loompy.LoomConnection, out_file: str) -> None:
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix(
        (w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    pos = np.vstack(
        (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]
    labels = ds.col_attrs["Clusters"][valid]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    classes = [
        "Neurons", "Astrocyte", "Ependymal", "OEC", "Oligos", "Schwann",
        "Cycling", "Vascular", "Immune"
    ]
    colors = [plt.cm.get_cmap('tab20')((ix + 0.5) / 20) for ix in range(20)]  # 'Vega20' was renamed 'tab20' in matplotlib 2.2

    combined_colors = np.zeros((ds.shape[1], 4)) + np.array((0.5, 0.5, 0.5, 0))

    for ix, cls in enumerate(classes):
        cmap = LinearSegmentedColormap.from_list('custom cmap',
                                                 [(1, 1, 1, 0), colors[ix]])
        cells = ds.col_attrs["Class0"] == classes[ix]
        if np.sum(cells) > 0:
            combined_colors[cells] = [
                cmap(x) for x in ds.col_attrs["Class_" + classes[ix]][cells]
            ]

    cmap = LinearSegmentedColormap.from_list('custom cmap',
                                             [(1, 1, 1, 0), colors[ix + 1]])
    ery_color = np.array(
        [[1, 1, 1, 0],
         [0.9, 0.71, 0.76,
          0]])[(ds.col_attrs["Class"][valid] == "Erythrocyte").astype('int')]
    cells = ds.col_attrs["Class0"] == "Erythrocyte"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([1, 0.71, 0.76, 0])

    cmap = LinearSegmentedColormap.from_list('custom cmap',
                                             [(1, 1, 1, 0), colors[ix + 2]])
    exc_color = np.array(
        [[1, 1, 1, 0],
         [0.5, 0.5, 0.5,
          0]])[(ds.col_attrs["Class0"][valid] == "Excluded").astype('int')]
    cells = ds.col_attrs["Class0"] == "Excluded"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([0.5, 0.5, 0.5, 0])

    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Class")
    nx.draw_networkx_edges(g, pos=pos, alpha=0.2, width=0.1, edge_color='gray')
    nx.draw_networkx_nodes(g,
                           pos=pos,
                           node_color=combined_colors[valid],
                           node_size=10,
                           alpha=0.6,
                           linewidths=0)
    ax.axis('off')

    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
Example #23
def plot_graph_age(ds: loompy.LoomConnection, out_file: str,
                   tags: List[str]) -> None:
    def parse_age(age: str) -> float:
        if age == "":
            return 0
        unit, amount = age[0], float(age[1:])
        if unit == "P":
            amount += 19.
        return amount

    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')

    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix(
        (w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    sfdp = np.vstack(
        (ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]
    # Sort the points so that every circle is visible and overlaps are reduced in crowded regions
    orderx = np.argsort(sfdp[:, 0], kind="mergesort")
    ordery = np.argsort(sfdp[:, 1], kind="mergesort")
    orderfin = orderx[ordery]
    # draw_networkx_edges still needs the positions in the original order of the graph `g`;
    # keeping a copy is a shortcut that avoids re-sorting the graph itself
    sfdp_original = sfdp.copy()
    sfdp = sfdp[orderfin, :]
    labels = ds.col_attrs["Clusters"][valid][orderfin]
    age = np.fromiter(map(parse_age, ds.col_attrs["Age"]),
                      dtype=float)[valid][orderfin]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    ax = fig.add_subplot(111)

    # Draw the KNN graph first, with gray transparent edges
    nx.draw_networkx_edges(g,
                           pos=sfdp_original,
                           alpha=0.1,
                           width=0.1,
                           edge_color='gray')
    # Then draw the nodes, colored by label
    block_colors = plt.cm.nipy_spectral_r((age - 6) / 14.)
    nx.draw_networkx_nodes(g,
                           pos=sfdp,
                           node_color=block_colors,
                           node_size=10,
                           alpha=0.4,
                           linewidths=0)

    for lbl in range(0, max(labels) + 1):
        if np.sum(labels == lbl) == 0:
            continue
        (x, y) = np.median(sfdp[np.where(labels == lbl)[0]], axis=0)
        text = "#" + str(lbl)
        if len(tags[lbl]) > 0:
            text += "\n" + tags[lbl]
        ax.text(x,
                y,
                text,
                fontsize=8,
                bbox=dict(facecolor='gray', alpha=0.3, ec='none'))
    ax.axis('off')
    levels = np.unique(age)
    for il, lev in enumerate(levels):
        ax.add_patch(
            plt.Rectangle((0.90, 0.7 + il * 0.016),
                          0.014,
                          0.014,
                          color=plt.cm.nipy_spectral_r((lev - 6) / 14.),
                          clip_on=0,
                          transform=ax.transAxes))
        ax.text(0.93,
                0.703 + il * 0.016,
                ("E%.1f" % lev if lev < 18.5 else "P%.1f" % (lev - 19)),
                transform=ax.transAxes)
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
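parse_age maps embryonic and postnatal labels onto a single scale (postnatal days are offset by 19, i.e. P0 == E19); if it were hoisted out of plot_graph_age it would behave as:

assert parse_age("E12.5") == 12.5
assert parse_age("P2") == 21.0
assert parse_age("") == 0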
Example #24
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info("Computing pseudoage")
        ages = np.array([age_to_num(x) for x in ds.ca.Age])
        knn = ds.col_graphs.KNN
        k = knn.nnz / knn.shape[0]
        ds.ca.PseudoAge = (knn.astype("bool") @ ages) / k

        logging.info("Slicing pseudoage")
        slice_names: List[str] = []
        with TemporaryDirectory() as tempfolder:
            slices = np.percentile(ds.ca.PseudoAge, np.arange(0, 101, 5))
            logging.info("Collecting cells")
            for (ix, _, view) in ds.scan(axis=1):
                for i in range(len(slices) - 2):
                    s1 = slices[i]
                    s2 = slices[i + 2]
                    slice_name = f"Age{s1:05.2f}to{s2:05.2f}".replace(
                        ".", "") + ".loom"
                    if slice_name not in slice_names:
                        slice_names.append(slice_name)
                    cells = ((view.ca.PseudoAge >= s1) &
                             (view.ca.PseudoAge < s2))
                    if cells.sum() == 0:
                        continue
                    fname = os.path.join(tempfolder, slice_name)
                    if not os.path.exists(fname):
                        with loompy.new(fname) as dsout:
                            dsout.add_columns(view.layers[:, cells],
                                              col_attrs=view.ca[cells],
                                              row_attrs=view.ra)
                    else:
                        with loompy.connect(fname) as dsout:
                            dsout.add_columns(view.layers[:, cells],
                                              col_attrs=view.ca[cells],
                                              row_attrs=view.ra)

            for slice_name in slice_names:
                fname = os.path.join(tempfolder, slice_name)
                logging.info("Cytograph on " + slice_name)
                with loompy.connect(fname) as ds_slice:  # use a distinct name so the outer `ds` is not shadowed
                    Cytograph(config=load_config()).fit(ds_slice)

            # Use dynamic programming to find the deepest tree (forest), as given by total number of cells along each branch
            logging.info("Computing pseudolineage")
            clusters = "Clusters"
            min_pct = 0.1

            # List of matrices giving the bipartite graph between each pair of layers, weighted by number of shared cells
            overlaps = []
            n_nodes = []  # List of number of nodes (clusters) in each layer
            n_cells = []  # List of arrays giving the number of cells in each cluster
            n_layers = len(slice_names)

            # Compute the bipartite graphs between layers
            for t in range(n_layers):
                # Link clusters from layer t to clusters from layer t + 1
                logging.info(f"{slice_names[t]}.loom")
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[t])) as ds1:
                    n_nodes.append(ds1.ca[clusters].max() + 1)
                    n_cells.append(np.zeros(n_nodes[t]))
                    for c in range(n_nodes[t]):
                        n_cells[t][c] = (ds1.ca[clusters] == c).sum()
                    if t >= n_layers - 1:
                        break
                    with loompy.connect(
                            os.path.join(tempfolder,
                                         slice_names[t + 1])) as ds2:
                        overlap = np.zeros(
                            (np.unique(ds1.ca[clusters]).shape[0],
                             np.unique(ds2.ca[clusters]).shape[0]),
                            dtype="int")
                        for i in np.unique(ds1.ca[clusters]):
                            cells1 = ds1.ca.CellID[ds1.ca[clusters] == i]
                            for j in np.unique(ds2.ca[clusters]):
                                cells2 = ds2.ca.CellID[ds2.ca[clusters] == j]
                                overlap[i, j] = np.intersect1d(cells1,
                                                               cells2).shape[0]
                        overlaps.append(overlap)

            # List of arrays keeping track of the depth of the deepest tree starting at each node in the layer
            # Depth defined as sum of the number of shared cells along the branch
            depths = [np.zeros(n, dtype="int") for n in n_nodes]
            # List of arrays giving the predecessor of each cluster (or -1 if no predecessor)
            edges = [np.zeros(n, dtype="int") for n in n_nodes[1:]]
            for t in range(0, n_layers - 1):
                for i in range(n_nodes[t + 1]):
                    # Now find the widest deepest branch from any node j in layer t to node i in layer t + 1
                    # Widest, deepest meaning: greatest sum of depth up to node j in layer t plus number of shared cells
                    # But disallowing any branch with less than min_pct % shared cells
                    best_j = -1
                    best_depth = 0
                    for j in range(n_nodes[t]):
                        pct_overlapping = 100 * overlaps[t][j, i] / (
                            n_cells[t][j] + n_cells[t + 1][i])
                        if pct_overlapping > min_pct:
                            depth = depths[t][j] + overlaps[t][j, i]
                            if depth > best_depth:
                                best_depth = depth
                                best_j = j
                    edges[t][i] = best_j

            # Now we have
            #
            # edges:    List of arrays giving the index of the predecessor of each cluster (or -1 if no predecessor exists)
            # overlaps: List of matrices giving the number of cells shared between clusters in layer t and t + 1
            # n_nodes:  List of number of nodes (clusters) in each layer
            # n_cells:  List of arrays of number of cells in each node (cluster)

            # Now position the nodes of each layer such that no edges cross
            ypositions = [np.arange(n_nodes[0])]
            for t in range(len(edges)):
                pos = np.full(n_nodes[t + 1], -1)
                for i in range(pos.shape[0]):
                    prev = edges[t][i]
                    if prev >= 0:
                        pos[i] = ypositions[t][prev]
                ordering = np.argsort(pos)
                mapping = dict(zip(ordering, range(len(ordering))))
                ypositions.append(
                    np.array([mapping[i] for i in range(len(ordering))]))
            # Make the positions proportional to the number of cells (cumulative)
            max_pos = 0
            for i, pos in enumerate(ypositions):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[i])) as ds0:
                    n_clusters = ds0.ca[clusters].max() + 1
                    ncells = np.array([(ds0.ca[clusters] == i).sum()
                                       for i in range(n_clusters)])
                    total = 0
                    new_pos = np.zeros_like(pos)
                    for j in range(len(pos)):
                        cluster = np.where(pos == j)[0]
                        new_pos[cluster] = total + ncells[cluster] / 2
                        total += ncells[cluster]
                ypositions[i] = new_pos / 1000
                max_pos = max(max_pos, max(ypositions[i]))

            for i, pos in enumerate(ypositions):
                ypositions[i] += (max_pos - np.max(pos)) / 2

            # Then position the layers properly in time
            xpositions = []
            for i in range(n_layers):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[i])) as ds0:
                    xpositions.append(np.mean(ds0.ca.PseudoAge))

            # Now project each individual cell to the pseudolineage
            logging.info("Projecting cells to pseudolineage")
            cell_to_xy = {}
            for t in range(len(n_nodes) - 1):
                with loompy.connect(os.path.join(tempfolder,
                                                 slice_names[t])) as ds0:
                    with loompy.connect(
                            os.path.join(tempfolder,
                                         slice_names[t + 1])) as ds1:
                        for i in range(n_nodes[t + 1]):
                            if edges[t][i] != -1:
                                y1 = ypositions[t][edges[t][i]]
                                y2 = ypositions[t + 1][i]
                                offset = (xpositions[t + 1] -
                                          xpositions[t]) / 4
                                overlapping_cells = (ds1.ca[clusters] == i) & (
                                    ds1.ca.PseudoAge < slices[t + 2])
                                crs = np.array(
                                    CatmullRomSpline(
                                        n_points=100).fit_transform(
                                            np.array(
                                                [[slices[t + 1] - offset, y1],
                                                 [slices[t + 1], y1],
                                                 [slices[t + 2], y2],
                                                 [slices[t + 2] + offset,
                                                  y2]])))
                                widths = np.linspace(n_cells[t][edges[t][i]],
                                                     n_cells[t + 1][i],
                                                     num=100) / 1500
                                f = interp1d(crs[:, 0],
                                             crs[:, 1],
                                             fill_value="extrapolate")
                                fw = interp1d(crs[:, 0],
                                              widths,
                                              fill_value="extrapolate")
                                y = f(
                                    ds1.ca.PseudoAge[overlapping_cells]
                                ) + np.random.normal(
                                    scale=fw(
                                        ds1.ca.PseudoAge[overlapping_cells]) /
                                    6,
                                    size=overlapping_cells.sum())
                                # Fresh loop variable, so the enclosing cluster index `i` is not clobbered
                                for n, ix in enumerate(np.where(overlapping_cells)[0]):
                                    cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[n]]
                        # Draw the leftmost pseudoage slice
                        if t == 0:
                            for i in range(n_nodes[0]):
                                y1 = ypositions[0][i]
                                y2 = ypositions[0][i]
                                widths = np.linspace(n_cells[t][i],
                                                     n_cells[t][i],
                                                     num=100) / 1500
                                overlapping_cells = (ds0.ca[clusters] == i) & (
                                    ds0.ca.PseudoAge < slices[1])
                                y = y1 + np.random.normal(
                                    scale=widths[0] / 6,
                                    size=overlapping_cells.sum())
                                for n, ix in enumerate(np.where(overlapping_cells)[0]):
                                    cell_to_xy[ds0.ca.CellID[ix]] = [ds0.ca.PseudoAge[ix], y[n]]
                        # Draw the rightmost pseudoage slice
                        if t == len(n_nodes) - 2:
                            for i in range(n_nodes[-1]):
                                y1 = ypositions[t][edges[t][i]]
                                y2 = ypositions[t + 1][i]
                                widths = np.linspace(n_cells[t][edges[t][i]],
                                                     n_cells[t + 1][i],
                                                     num=100) / 1500
                                overlapping_cells = (ds1.ca[clusters] == i) & (
                                    ds1.ca.PseudoAge > slices[-2])
                                y = y2 + np.random.normal(
                                    scale=widths[-1] / 6,
                                    size=overlapping_cells.sum())
                                for n, ix in enumerate(np.where(overlapping_cells)[0]):
                                    cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[n]]

            logging.info("Saving pseudolineage projection back in original file")
            xy = np.zeros((ds.shape[1], 2))
            for cellid, pos_xy in cell_to_xy.items():
                j = np.where(ds.ca.CellID == cellid)[0]
                xy[j] = pos_xy
            ds.ca.PseudoLineage = xy
Example #25
    def fit(self,
            ds: loompy.LoomConnection,
            plot_file: str = None,
            report_file: str = None) -> np.ndarray:
        """
		Fit a classifier and use it to determine cluster predictive power

		Args:
			ds		Dataset
			plot_file	Filename for optional plot
			report_file	Filename for optional report

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""

        if "ClusterName" in ds.ca:
            cluster_names = [
                str(ds.ca.ClusterName[ds.ca.Clusters == lbl][0])
                for lbl in np.unique(ds.ca.Clusters)
            ]
        else:
            cluster_names = [str(lbl) for lbl in np.unique(ds.ca.Clusters)]

        genes = np.where(ds.ra.Selected == 1)[0]
        data = ds.sparse(rows=genes).T
        hpf = HPF(k=ds.ca.HPF.shape[1],
                  validation_fraction=0.05,
                  min_iter=10,
                  max_iter=200,
                  compute_X_ppv=False)
        hpf.fit(data)
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

        train_X, test_X, train_Y, test_Y = train_test_split(theta,
                                                            ds.ca.Clusters,
                                                            test_size=0.2)
        classifier = RandomForestClassifier(max_depth=30)
        classifier.fit(train_X, train_Y)
        self.report = classification_report(test_Y,
                                            classifier.predict(test_X),
                                            labels=np.unique(ds.ca.Clusters),
                                            target_names=cluster_names)
        self.proba = classifier.predict_proba(theta)

        if plot_file is not None:
            plt.figure()
            agg = npg.aggregate(ds.ca.Clusters,
                                self.proba,
                                axis=0,
                                func="mean")
            plt.imshow(agg, cmap="viridis")
            plt.xticks(np.arange(len(cluster_names)),
                       cluster_names,
                       rotation="vertical",
                       fontsize=7)
            plt.yticks(np.arange(len(cluster_names)),
                       cluster_names,
                       rotation="horizontal",
                       fontsize=7)
            plt.xlabel("Predicted cluster")
            plt.ylabel("Ground truth cluster")
            plt.title("Cluster quality (predictive power)")
            cbar = plt.colorbar()
            cbar.set_label('Probability of predicted cluster', rotation=90)
            plt.savefig(plot_file, bbox_inches="tight")
            plt.close()
        if report_file is not None:
            with open(report_file, "w") as f:
                f.write(self.report)

        return self.proba
Example #26
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info(f"Normalizing and selecting {self.n_genes} genes")
        normalizer = Normalizer(False)
        normalizer.fit(ds)
        genes = FeatureSelectionByVariance(self.n_genes,
                                           mask=self.mask).fit(ds)
        self.genes = genes

        if self.factorization == 'PCA' or self.factorization == 'both' or self.batch_keys is not None:
            factorization = "PCA"
        else:
            factorization = "HPF"

        if factorization == "PCA":
            logging.info("PCA projection to %d components", min(self.n_factors, ds.shape[1]))
            pca = PCA(genes,
                      max_n_components=self.n_factors,
                      test_significance=False,
                      batch_keys=self.batch_keys)
            transformed = pca.fit_transform(ds, normalizer)
        else:
            data = ds.sparse(rows=genes).T
            # Subsample to lowest number of UMIs
            if "TotalUMI" in ds.ca:
                totals = ds.ca.TotalUMI
            else:
                totals = ds.map([np.sum], axis=1)[0]
            min_umis = int(np.min(totals))
            logging.debug(f"Subsampling to {min_umis} UMIs")
            temp = data.toarray()
            for c in range(temp.shape[0]):
                temp[c, :] = np.random.binomial(temp[c, :].astype('int32'),
                                                min_umis / totals[c])
            data = sparse.coo_matrix(temp)

            # HPF factorization
            hpf = HPF(k=self.n_factors,
                      validation_fraction=0.05,
                      min_iter=10,
                      max_iter=200,
                      compute_X_ppv=False,
                      n_threads=self.n_threads)
            hpf.fit(data)
            transformed = (hpf.theta.T / hpf.theta.sum(axis=1)).T  # Normalize so the rows sum to one, because JSD requires it

        # KNN in latent space
        logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
        with warnings.catch_warnings():
            warnings.simplefilter(
                "ignore", category=NumbaPerformanceWarning
            )  # Suppress warnings about numba not being able to parallelize code
            warnings.simplefilter(
                "ignore", category=NumbaPendingDeprecationWarning
            )  # Suppress warnings about future deprecations
            warnings.simplefilter(
                "ignore", category=SparseEfficiencyWarning
            )  # Suppress warnings about setting the diagonal to 1
            nn = NNDescent(data=transformed,
                           metric=(jensen_shannon_distance
                                   if factorization == "HPF" else "euclidean"))
            indices, distances = nn.query(transformed, k=self.k_pooling)
            # Note: we convert distances to similarities here, to support Poisson smoothing below
            knn = sparse.csr_matrix(
                (np.ravel(distances), np.ravel(indices),
                 np.arange(0, distances.shape[0] * distances.shape[1] + 1,
                           distances.shape[1])),
                (transformed.shape[0], transformed.shape[0]))
            max_d = knn.data.max()
            knn.data = (max_d - knn.data) / max_d
            knn.setdiag(1)  # Causes a sparse efficiency warning, but it's not a slow step relative to everything else
            self.knn = knn
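As a side note, the distance-to-similarity conversion at the end of fit can be checked on a toy query result. This is a minimal sketch using only numpy and scipy; the four points and their k = 2 neighbor lists are made up.

import numpy as np
from scipy import sparse

# Hypothetical query output for 4 points with k = 2 neighbors each
indices = np.array([[1, 2], [0, 2], [0, 1], [2, 1]])
distances = np.array([[0.1, 0.4], [0.1, 0.3], [0.3, 0.4], [0.2, 0.5]])
n, k = distances.shape

# Same CSR construction as above: row i holds the k edges of point i
knn = sparse.csr_matrix(
    (np.ravel(distances), np.ravel(indices),
     np.arange(0, n * k + 1, k)), (n, n))

# Rescale so the largest distance maps to similarity 0 and a zero
# distance maps to similarity 1, then set self-similarity to 1
max_d = knn.data.max()
knn.data = (max_d - knn.data) / max_d
knn.setdiag(1)  # Emits a SparseEfficiencyWarning; harmless at this size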
Example #27
def aggregate_loom(ds: loompy.LoomConnection,
                   out_file: str,
                   select: np.ndarray,
                   group_by: str,
                   aggr_by: str,
                   aggr_ca_by: Dict[str, str],
                   return_matrix: bool = False) -> np.ndarray:
    """
	Aggregate a Loom file by applying aggregation functions to the main matrix as well as to the column attributes

	Args:
		ds				The Loom file
		out_file		The name of the output Loom file (will be appended to if it exists)
		select			Bool array giving the columns to include (deprecated; must be None)
		group_by		The column attribute to group by
		aggr_by			The aggregation function for the main matrix
		aggr_ca_by		The aggregation functions for the column attributes (or None to skip)
		return_matrix	If true, return the aggregated matrix instead of writing it to out_file

	Remarks:
		aggr_by gives the aggregation function for the main matrix
		aggr_ca_by is a dictionary with column attributes as keys and aggregation functions as values

		Aggregation functions can be any valid aggregation function from here: https://github.com/ml31415/numpy-groupies

		In addition, you can specify:
			"tally" to count the number of occurrences of each value of a categorical attribute
	"""
    ca = {}  # type: Dict[str, np.ndarray]
    if select is not None:
        raise ValueError("The 'select' argument is deprecated")
    labels = (ds.ca[group_by]).astype('int')
    # Renumber the labels as consecutive integers starting at zero, with no holes
    _, zero_strt_sort_noholes_lbls = np.unique(labels, return_inverse=True)
    n_groups = len(set(labels))
    if aggr_ca_by is not None:
        for key in ds.col_attrs.keys():
            if key not in aggr_ca_by:
                continue
            func = aggr_ca_by[key]
            if func == "tally":
                for val in set(ds.col_attrs[key]):
                    ca[key + "_" + val] = npg.aggregate(
                        zero_strt_sort_noholes_lbls,
                        (ds.col_attrs[key] == val).astype('int'),
                        func="sum",
                        fill_value=0)
            elif func == "mode":

                def mode(x):
                    return scipy.stats.mode(x)[0][0]

                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        ds.col_attrs[key],
                                        func=mode,
                                        fill_value=0).astype('str')
            elif func == "mean":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        ds.col_attrs[key],
                                        func=func,
                                        fill_value=0)
            elif func == "first":
                ca[key] = npg.aggregate(zero_strt_sort_noholes_lbls,
                                        ds.col_attrs[key],
                                        func=func,
                                        fill_value=ds.col_attrs[key][0])

    m = np.empty((ds.shape[0], n_groups))
    for (_, selection, view) in ds.scan(axis=0):
        vals_aggr = npg.aggregate(zero_strt_sort_noholes_lbls,
                                  view[:, :],
                                  func=aggr_by,
                                  axis=1,
                                  fill_value=0)
        m[selection, :] = vals_aggr

    if return_matrix:
        return m

    loompy.create_append(out_file, m, ds.ra, ca, fill_values="auto")
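A minimal usage sketch for aggregate_loom follows. The file names and the column attributes Age, SampleID and Sex are hypothetical; "Clusters" follows the convention used elsewhere in these examples, and select is passed as None since it is deprecated.

import loompy

with loompy.connect("cells.loom") as ds:
    # One output column per cluster: mean expression for the main matrix,
    # plus a per-cluster summary of each listed column attribute
    aggregate_loom(ds, "clusters.loom", None, "Clusters", "mean",
                   {"Age": "mean", "SampleID": "first", "Sex": "tally"})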
Example #28
	def fit(self, ds: loompy.LoomConnection) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
		"""
		Discover the manifold
		Args:
			n_genes		Number of genes to use for manifold learning (ignored if genes is not None)
			gtsne		Use graph t-SNE for layout (default: standard t-SNE)
			alpha		The scale parameter for multiscale KNN
			genes		List of genes to use for manifold learning

		Returns:
			knn		The multiscale knn graph as a sparse matrix, with k = 100
			mknn	Mutual knn subgraph, with k = 20
			pos		2D projection (t-SNE or gt-SNE) as ndarray with shape (n_cells, 2)
		"""
		n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
		n_total = ds.shape[1]
		logging.info("%d of %d cells were valid", n_valid, n_total)
		logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
		cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

		logging.info("Normalization")
		normalizer = cg.Normalizer(False)
		normalizer.fit(ds)

		if self.filter_cellcycle is not None:
			cell_cycle_genes = np.array(open(self.filter_cellcycle).read().split())
			mask = np.in1d(ds.ra.Gene, cell_cycle_genes)
			if np.sum(mask) == 0:
				logging.warning("No cell cycle genes were filtered; check your gene list")
		else:
			mask = None

		if self.genes is None:
			logging.info("Selecting up to %d genes", self.n_genes)
			genes = cg.FeatureSelection(self.n_genes).fit(ds, mu=normalizer.mu, sd=normalizer.sd, mask=mask)
			temp = np.zeros(ds.shape[0])
			temp[genes] = 1
			ds.set_attr("_Selected", temp, axis=0)
			logging.info("%d genes selected", temp.sum())

			n_components = min(50, n_valid)
			logging.info("PCA projection to %d components", n_components)
			pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
			pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
			transformed = pca_transformed

			logging.info("Generating KNN graph")
			k = min(10, n_valid - 1)
			nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
			nn.fit(transformed)
			knn = nn.kneighbors_graph(mode='connectivity')
			knn = knn.tocoo()
			mknn = knn.minimum(knn.transpose()).tocoo()

			logging.info("Louvain-Jaccard clustering")
			lj = cg.LouvainJaccard(resolution=1)
			labels = lj.fit_predict(knn)

			# Make labels for excluded cells == -1
			labels_all = np.full(ds.shape[1], -1, dtype='int')
			labels_all[cells] = labels
			ds.set_attr("Clusters", labels_all, axis=1)
			n_labels = np.max(labels) + 1
			logging.info("Found %d LJ clusters", n_labels)

			logging.info("Marker selection")
			(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)
		else:
			genes = self.genes

		temp = np.zeros(ds.shape[0])
		temp[genes] = 1
		ds.set_attr("_Selected", temp, axis=0)
		logging.info("%d genes selected", temp.sum())

		n_components = min(50, n_valid)
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components, layer=self.layer)
		pca_transformed = pca.fit_transform(ds, normalizer, cells=cells)
		transformed = pca_transformed

		logging.info("Generating KNN graph")
		k = min(10, n_valid - 1)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors_graph(mode='connectivity')
		knn = knn.tocoo()
		mknn = knn.minimum(knn.transpose()).tocoo()

		logging.info("Louvain-Jaccard clustering")
		lj = cg.LouvainJaccard(resolution=1)
		labels = lj.fit_predict(knn)

		# Make labels for excluded cells == -1
		labels_all = np.full(ds.shape[1], -1, dtype='int')
		labels_all[cells] = labels
		ds.set_attr("Clusters", labels_all, axis=1)
		n_labels = np.max(labels) + 1
		logging.info("Found %d LJ clusters", n_labels)

		logging.info("Marker selection")
		(genes, _, _) = cg.MarkerSelection(n_markers=int(500 / n_labels), mask=mask).fit(ds)

		# Select cells across clusters more uniformly, preventing a single cluster from dominating the PCA
		cells_adjusted = cg.cap_select(labels, cells, int(n_valid * 0.2))
		n_components = min(50, cells_adjusted.shape[0])
		logging.info("PCA projection to %d components", n_components)
		pca = cg.PCAProjection(genes, max_n_components=n_components)
		pca.fit(ds, normalizer, cells=cells_adjusted)
		# Note that here we're transforming all cells; we just did the fit on the selection
		transformed = pca.transform(ds, normalizer, cells=cells)

		k = min(100, n_valid - 1)
		logging.info("Generating multiscale KNN graph (k = %d)", k)
		nn = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=4)
		nn.fit(transformed)
		knn = nn.kneighbors(return_distance=False)  # shape: (n_cells, k)
		n_cells = knn.shape[0]
		a = np.tile(np.arange(n_cells), k)
		b = np.reshape(knn.T, (n_cells * k,))
		w = np.repeat(1 / np.power(np.arange(1, k + 1), self.alpha), n_cells)
		knn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells))
		threshold = w > 0.05
		mknn = sparse.coo_matrix((w[threshold], (a[threshold], b[threshold])), shape=(n_cells, n_cells))
		mknn = mknn.minimum(mknn.transpose()).tocoo()

		perplexity = min(k, (n_valid - 1) / 3 - 1)
		if self.gtsne:
			logging.info("gt-SNE layout")
			# Note that perplexity argument is ignored in this case, but must still be given
			# because bhtsne will check that it has a valid value
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed, knn=knn.tocsr())
		else:
			logging.info("t-SNE layout")
			tsne_pos = cg.TSNE(perplexity=perplexity).layout(transformed)
		tsne_all = np.zeros((ds.shape[1], 2)) + np.min(tsne_pos, axis=0)  # Place excluded cells at the lower-left corner
		tsne_all[cells] = tsne_pos

		# Transform back to the full set of cells
		knn = sparse.coo_matrix((knn.data, (cells[knn.row], cells[knn.col])), shape=(n_total, n_total))
		mknn = sparse.coo_matrix((mknn.data, (cells[mknn.row], cells[mknn.col])), shape=(n_total, n_total))

		return (knn, mknn, tsne_all)
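The multiscale weighting above gives the j-th nearest neighbor the weight 1 / j**alpha, so near neighbors dominate while distant ones still contribute. A small sketch of how the 0.05 threshold interacts with the k = 100 graph, assuming alpha = 1 (the actual value of self.alpha is set elsewhere):

import numpy as np

k, alpha = 100, 1
w = 1 / np.power(np.arange(1, k + 1), alpha)
print(w[:3])             # [1.0, 0.5, 0.333...]
print(np.sum(w > 0.05))  # 19: only the 19 nearest ranks survive the cutoff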
Example #29
    def fit(
        self, ds: loompy.LoomConnection
    ) -> Tuple[sparse.coo_matrix, sparse.coo_matrix, np.ndarray]:
        """
		Discover the manifold

		Returns:
			knn		The knn graph as a sparse matrix
			mknn	Mutual knn subgraph
			pos		2D projection (gt-SNE) as ndarray with shape (n_cells, 2)
		"""
        n_cells = ds.shape[1]
        logging.info("Processing all %d cells", n_cells)
        logging.info("Validating genes")
        nnz = ds.map([np.count_nonzero], axis=0)[0]
        valid_genes = np.logical_and(nnz > 5,
                                     nnz < ds.shape[1] * 0.5).astype("int")
        ds.ra._Valid = valid_genes
        logging.info("%d of %d genes were valid", np.sum(ds.ra._Valid == 1),
                     ds.shape[0])

        logging.info("Normalization")
        normalizer = cg.Normalizer(False)
        normalizer.fit(ds)
        logging.info("Selecting up to %d genes", self.n_genes)
        genes = cg.FeatureSelection(self.n_genes).fit(ds,
                                                      mu=normalizer.mu,
                                                      sd=normalizer.sd)

        logging.info("Loading data for selected genes")
        data = np.zeros((n_cells, genes.shape[0]))
        for (ix, selection, view) in ds.scan(axis=1):
            data[selection, :] = view[genes, :].T

        logging.info("Computing initial subspace KNN")
        subspaces = np.ones(data.shape)
        knn = subspace_knn_graph(data, subspaces)
        mknn = knn.minimum(knn.transpose()).tocoo()

        for t in range(5):
            logging.info(f"Refining subspace KNN (iteration {t + 1})")

            logging.info("Louvain clustering")
            graph = nx.from_scipy_sparse_matrix(mknn)
            partitions = community.best_partition(graph)
            labels = np.array(
                [partitions[key] for key in range(mknn.shape[0])])
            ds.ca.Clusters = labels
            n_labels = np.max(labels) + 1
            logging.info(f"Found {n_labels} clusters")

            logging.info("Marker selection")
            (_, enrichment, _) = cg.MarkerSelection(n_markers=10,
                                                    findq=False).fit(ds)
            subspaces = np.zeros(data.shape)
            n_markers_per_label = self.n_genes // n_labels
            for ix in range(n_labels):
                # Each cell's subspace is the top markers of its own cluster
                markers = np.argsort(-enrichment[:, ix])[:n_markers_per_label]
                subspaces[np.ix_(labels == ix, markers)] = 1
            knn = subspace_knn_graph(data, subspaces)
            mknn = knn.minimum(knn.transpose()).tocoo()

        perplexity = min(self.k, (n_cells - 1) / 3 - 1)
        logging.info("gt-SNE layout")
        # Note that perplexity argument is ignored in this case, but must still be given
        # because bhtsne will check that it has a valid value
        tsne_pos = cg.TSNE(perplexity=perplexity).layout(data, knn=knn.tocsr())

        return (knn, mknn, tsne_pos)
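The refinement step assigns each cell a binary gene subspace built from its own cluster's top markers. A toy version with made-up numbers (4 cells in 2 clusters, 5 genes, 2 markers per cluster):

import numpy as np

labels = np.array([0, 0, 1, 1])
enrichment = np.array([[5.0, 0.1],   # gene 0: cluster 0 marker
                       [3.0, 0.2],   # gene 1: cluster 0 marker
                       [0.2, 4.0],   # gene 2: cluster 1 marker
                       [0.1, 2.0],   # gene 3: cluster 1 marker
                       [1.0, 1.0]])  # gene 4: uninformative
subspaces = np.zeros((4, 5))
for ix in range(2):
    markers = np.argsort(-enrichment[:, ix])[:2]
    subspaces[np.ix_(labels == ix, markers)] = 1
# Cells 0-1 now select genes {0, 1}; cells 2-3 select genes {2, 3}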
	def _fit(self, ds: loompy.LoomConnection) -> Tuple[np.ndarray, np.ndarray]:
		"""
		Finds n_markers genes per cluster using enrichment score

		Args:
			ds (LoomConnection):	Dataset

		Returns:
			ndarray of selected genes (list of ints)
			ndarray of enrichment scores
		"""
		labels = ds.ca[self.labels_attr]
		n_labels = max(labels) + 1
		n_cells = ds.shape[1]

		# Number of cells per cluster
		sizes = np.bincount(labels, minlength=n_labels)
		# Number of nonzero values per cluster
		nnz = cg.aggregate_loom(ds, None, None, self.labels_attr, np.count_nonzero, None, return_matrix=True)
		# Mean value per cluster
		means = cg.aggregate_loom(ds, None, None, self.labels_attr, "mean", None, return_matrix=True)
		# Non-zeros and means over all cells
		(nnz_overall, means_overall) = ds.map([np.count_nonzero, np.mean], axis=0)
		# Scale by number of cells
		f_nnz = nnz / sizes
		f_nnz_overall = nnz_overall / n_cells

		# Means and fraction non-zero values in other clusters (per cluster)
		means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
		f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

		# enrichment = (f_nnz + 0.1) / (f_nnz_overall[None].T + 0.1) * (means + 0.01) / (means_overall[None].T + 0.01)
		enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

		# Select best markers
		if "_Valid" not in ds.ra:
			logging.info("Recomputing the list of valid genes")
			nnz = ds.map([np.count_nonzero], axis=0)[0]
			valid_genes = np.logical_and(nnz > 10, nnz < ds.shape[1] * 0.6)
			ds.ra._Valid = valid_genes.astype('int')
			
		included: List[int] = []

		if self.mask is None:
			excluded = set(np.where(ds.row_attrs["_Valid"] == 0)[0])
		else:
			excluded = set(np.where(np.logical_or(ds.row_attrs["_Valid"] == 0, self.mask))[0])

		for ix in range(n_labels):
			enriched = np.argsort(enrichment[:, ix])[::-1]
			n = 0
			count = 0
			while count < self.n_markers:
				if enriched[n] in excluded:
					n += 1
					continue
				included.append(enriched[n])
				excluded.add(enriched[n])
				n += 1
				count += 1
		return (np.array(included), enrichment)
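To see what the enrichment statistic rewards, here is a single-gene worked example with made-up counts; the 2-D version above broadcasts the same arithmetic over all genes at once.

import numpy as np

sizes = np.array([3, 2])              # cells per cluster
n_cells = sizes.sum()
means = np.array([4.0, 0.5])          # mean expression of the gene per cluster
f_nnz = np.array([1.0, 0.5])          # fraction of nonzero cells per cluster

means_overall = (means * sizes).sum() / n_cells
f_nnz_overall = (f_nnz * sizes).sum() / n_cells
means_other = (means_overall * n_cells - means * sizes) / (n_cells - sizes)
f_nnz_other = (f_nnz_overall * n_cells - f_nnz * sizes) / (n_cells - sizes)

enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)
print(enrichment)  # roughly [14.4, 0.07]: the gene marks cluster 0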