def expression_patterns(ds: loompy.LoomConnection, labels: np.ndarray, pep: float, f: float, cells: np.ndarray = None) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Derive enrichment and trinary scores for all genes

    Args:
        ds (LoomConnection):    Dataset
        labels (numpy array):   Cluster labels (one per cell)
        pep (float):            Desired posterior error probability
        f (float):              Fraction required for a gene to be considered 'expressed'
        cells (numpy array):    Indices of cells to include

    Returns:
        scores1 (numpy 2d array):       Array of (n_genes, n_labels); mean of the gene within
                                        the cluster divided by its mean over all scanned cells
        scores2 (numpy 2d array):       Array of (n_genes, n_labels); number of cells in the
                                        cluster with a nonzero value for the gene
        trinary_prob (numpy 2d array):  Array of (n_genes, n_labels); first value returned by
                                        betabinomial_trinarize_array for each gene
        trinary_pat (numpy 2d array):   Array of (n_genes, n_labels); second value returned by
                                        betabinomial_trinarize_array for each gene

    Remarks:
        If the cells argument is provided, the labels should include only those cells. That is,
        labels.shape[0] == cells.shape[0].

        Amit says, regarding marker genes: I usually rank the genes by some kind of enrichment score.
            score1 = mean of gene within the cluster / mean of gene in all cells
            score2 = fraction of positive cells within cluster
            enrichment score = score1 * score2^power (where power == 0.5 or 1); I usually use 1 for 10x data

        NOTE(review): score2 as implemented here is the raw *count* of nonzero cells in the
        cluster, not a fraction (a division by the gene's total nonzero count, f0, appears to
        have been disabled at some point). Callers that need a fraction must normalize it
        themselves -- confirm the intended definition before changing it.

        For a gene whose overall mean or overall nonzero count is zero, and for labels that have
        no cells, both scores are 0. Values are used as delivered by batch_scan; no normalization
        is applied.
    """
    n_labels = np.max(labels) + 1
    n_genes = ds.shape[0]

    scores1 = np.empty((n_genes, n_labels))
    scores2 = np.empty((n_genes, n_labels))
    trinary_pat = np.empty((n_genes, n_labels))
    trinary_prob = np.empty((n_genes, n_labels))

    # The cells belonging to each label do not depend on the gene, so work them out once
    # instead of once per gene. Labels without any cells are left out; their scores stay zero.
    label_members = []
    for lbl in range(n_labels):
        members = np.where(labels == lbl)[0]
        if members.size > 0:
            label_members.append((lbl, members))

    for (_, selection, vals) in ds.batch_scan(cells=cells, genes=None, axis=0):
        for j, row in enumerate(selection):
            data = vals[j, :]
            mu0 = np.mean(data)
            f0 = np.count_nonzero(data)
            score1 = np.zeros(n_labels)
            score2 = np.zeros(n_labels)
            # A gene with zero overall mean or no nonzero cells keeps all-zero scores
            # (this also avoids dividing by a zero mean).
            if mu0 != 0 and f0 != 0:
                for lbl, members in label_members:
                    sel = data[members]
                    score1[lbl] = np.mean(sel) / mu0
                    score2[lbl] = np.count_nonzero(sel)
            scores1[row, :] = score1
            scores2[row, :] = score2
            trinary_prob[row, :], trinary_pat[row, :] = betabinomial_trinarize_array(data, labels, pep, f)
    return (scores1, scores2, trinary_prob, trinary_pat)
def fit(self, ds: loompy.LoomConnection) -> np.ndarray:
    """
    Compute the trinarization result for every gene over the clustered cells

    Only cells whose "Clusters" column attribute is >= 0 are used. Each row delivered by
    batch_scan is rounded and passed, together with the cluster labels, self.f and the
    number of labels, to self._betabinomial_trinarize_array; the result is stored in the
    corresponding row of self.trinary_prob.

    Args:
        ds (LoomConnection):    Dataset; must have a "Clusters" column attribute and a
                                "Gene" row attribute

    Returns:
        trinary_prob (numpy 2d array):  Array of (ds.shape[0], n_labels), where n_labels is
                                        the largest selected cluster label plus one. The same
                                        array is kept as self.trinary_prob.

    Remarks:
        As a side effect, self.genes is set to ds.ra.Gene.
    """
    cluster_ids = ds.col_attrs["Clusters"]
    cells = np.where(cluster_ids >= 0)[0]
    labels = cluster_ids[cells]
    n_labels = np.max(labels) + 1
    logging.info("n_labels %d", n_labels)

    n_rows = ds.shape[0]
    self.trinary_prob = np.empty((n_rows, n_labels))
    self.genes = ds.ra.Gene

    for (_, selection, vals) in ds.batch_scan(cells=cells, genes=None, axis=0):
        # `offset` is the position within the current batch, `gene_row` the row in the dataset
        for offset, gene_row in enumerate(selection):
            rounded = np.round(vals[offset, :])
            self.trinary_prob[gene_row, :] = self._betabinomial_trinarize_array(rounded, labels, self.f, n_labels)
    return self.trinary_prob