def fit(self, ds: loompy.LoomConnection, plot_file: str = None, report_file: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    The selected genes (ds.ra.Selected == 1) are factorized with HPF, each cell's
    factor loadings are normalized to sum to one, and a random forest is trained to
    predict ds.ca.Clusters from them. 20% of the cells are held out and used for the
    classification report. The report is stored in self.report and the class
    probabilities for all cells in self.proba.

    Args:
        ds              Dataset
        plot_file       Filename for optional plot
        report_file     Filename for optional report

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)
    """
    # Read the labels once; they are reused for naming, training, reporting and plotting
    clusters = ds.ca.Clusters
    labels = np.unique(clusters)
    if "ClusterName" in ds.ca:
        names = ds.ca.ClusterName
        # Each label is named after the first cell that carries it
        cluster_names = [str(names[clusters == lbl][0]) for lbl in labels]
    else:
        cluster_names = [str(lbl) for lbl in labels]

    # Factorize the selected genes, using as many components as the stored HPF attribute has columns
    genes = np.where(ds.ra.Selected == 1)[0]
    data = ds.sparse(rows=genes).T
    hpf = HPF(k=ds.ca.HPF.shape[1], validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False)
    hpf.fit(data)
    # Normalize every row of theta so that it sums to one
    theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    # Train on 80% of the cells and evaluate on the remaining 20%
    train_X, test_X, train_Y, test_Y = train_test_split(theta, clusters, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(
        test_Y,
        classifier.predict(test_X),
        labels=labels,
        target_names=cluster_names,
    )
    self.proba = classifier.predict_proba(theta)

    if plot_file is not None:
        fig = plt.figure()
        try:
            # Mean predicted probability, grouped by ground-truth cluster
            agg = npg.aggregate(clusters, self.proba, axis=0, func="mean")
            plt.imshow(agg, cmap="viridis")
            tick_positions = np.arange(len(cluster_names))
            plt.xticks(tick_positions, cluster_names, rotation="vertical", fontsize=7)
            plt.yticks(tick_positions, cluster_names, rotation="horizontal", fontsize=7)
            plt.xlabel("Predicted cluster")
            plt.ylabel("Ground truth cluster")
            plt.title("Cluster quality (predictive power)")
            cbar = plt.colorbar()
            cbar.set_label('Probability of predicted cluster', rotation=90)
            plt.savefig(plot_file, bbox_inches="tight")
        finally:
            # Close this figure even when plotting or saving fails, so it is not left open in pyplot
            plt.close(fig)

    if report_file is not None:
        # Explicit encoding so that non-ASCII cluster names do not depend on the platform default
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(self.report)

    return self.proba
def fit(self, ds: loompy.LoomConnection) -> None:
    """
    Select genes, project the cells to a latent space and build a KNN similarity graph

    The latent space is computed by PCA when self.factorization is 'PCA' or 'both', or
    when self.batch_keys is set; otherwise by HPF on data subsampled to the lowest
    per-cell UMI total. The selected genes are stored in self.genes and the resulting
    sparse similarity matrix (n_cells x n_cells, ones on the diagonal) in self.knn.

    Args:
        ds      Dataset
    """
    logging.info(f"Normalizing and selecting {self.n_genes} genes")
    normalizer = Normalizer(False)
    normalizer.fit(ds)
    selected = FeatureSelectionByVariance(self.n_genes, mask=self.mask).fit(ds)
    self.genes = selected

    # PCA is used when requested explicitly, as part of 'both', or whenever batch keys are given
    use_pca = self.factorization in ('PCA', 'both') or self.batch_keys is not None

    if use_pca:
        # NOTE(review): the logged count is min(50, ds.shape[1]), while the PCA below is
        # capped by self.n_factors -- confirm which of the two is intended
        n_components = min(50, ds.shape[1])
        logging.info("PCA projection to %d components", n_components)
        pca = PCA(
            selected,
            max_n_components=self.n_factors,
            test_significance=False,
            batch_keys=self.batch_keys,
        )
        transformed = pca.fit_transform(ds, normalizer)
    else:
        data = ds.sparse(rows=selected).T

        # Per-cell totals: use the stored attribute if present, otherwise sum over the columns
        totals = ds.ca.TotalUMI if "TotalUMI" in ds.ca else ds.map([np.sum], axis=1)[0]
        min_umis = int(np.min(totals))
        logging.debug(f"Subsampling to {min_umis} UMIs")

        # Binomial thinning, one cell at a time, down to the smallest total
        dense = data.toarray()
        n_cells = dense.shape[0]
        for cell in range(n_cells):
            keep_fraction = min_umis / totals[cell]
            counts = dense[cell, :].astype('int32')
            dense[cell, :] = np.random.binomial(counts, keep_fraction)
        data = sparse.coo_matrix(dense)

        # Factorize the subsampled counts with HPF
        hpf = HPF(
            k=self.n_factors,
            validation_fraction=0.05,
            min_iter=10,
            max_iter=200,
            compute_X_ppv=False,
            n_threads=self.n_threads,
        )
        hpf.fit(data)
        # Rows must sum to one, because the Jensen-Shannon distance requires it
        row_sums = hpf.theta.sum(axis=1)
        transformed = (hpf.theta.T / row_sums).T

    # Nearest neighbours in the latent space
    logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
    with warnings.catch_warnings():
        # Silence, in turn: numba failing to parallelize, numba announcing future
        # deprecations, and scipy complaining about the diagonal being set below
        for ignored in (NumbaPerformanceWarning, NumbaPendingDeprecationWarning, SparseEfficiencyWarning):
            warnings.simplefilter("ignore", category=ignored)

        metric = "euclidean" if use_pca else jensen_shannon_distance
        nn = NNDescent(data=transformed, metric=metric)
        indices, distances = nn.query(transformed, k=self.k_pooling)

        # Every row holds the same number of neighbours, so the CSR row pointer is a plain arithmetic progression
        n_rows, n_neighbors = distances.shape
        row_pointer = np.arange(0, n_rows * n_neighbors + 1, n_neighbors)
        n_points = transformed.shape[0]
        knn = sparse.csr_matrix(
            (np.ravel(distances), np.ravel(indices), row_pointer),
            (n_points, n_points),
        )

        # Turn distances into similarities (largest distance -> 0, zero distance -> 1)
        # so that the graph can be used for Poisson smoothing
        max_d = knn.data.max()
        knn.data = (max_d - knn.data) / max_d
        # Triggers a sparse efficiency warning, but the step is cheap relative to everything else
        knn.setdiag(1)

    self.knn = knn