    def fit(self,
            ds: loompy.LoomConnection,
            plot_file: str = None,
            report_file: str = None) -> np.ndarray:
        """
		Fit a classifier and use it to determine cluster predictive power

		Args:
			ds		Dataset
			plot_file	Filename for optional plot
			report_file	Filename for optional report

		Returns:
			Matrix of classification probabilities, shape (n_cells, n_labels)
		"""

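        # Use human-readable cluster names when available, otherwise fall back to the numeric labels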
        if "ClusterName" in ds.ca:
            cluster_names = [
                str(ds.ca.ClusterName[ds.ca.Clusters == lbl][0])
                for lbl in np.unique(ds.ca.Clusters)
            ]
        else:
            cluster_names = [str(lbl) for lbl in np.unique(ds.ca.Clusters)]

        genes = np.where(ds.ra.Selected == 1)[0]
        data = ds.sparse(rows=genes).T
        hpf = HPF(k=ds.ca.HPF.shape[1],
                  validation_fraction=0.05,
                  min_iter=10,
                  max_iter=200,
                  compute_X_ppv=False)
        hpf.fit(data)
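        # Normalize theta so each cell's factor loadings sum to one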
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

        train_X, test_X, train_Y, test_Y = train_test_split(theta,
                                                            ds.ca.Clusters,
                                                            test_size=0.2)
        classifier = RandomForestClassifier(max_depth=30)
        classifier.fit(train_X, train_Y)
        self.report = classification_report(test_Y,
                                            classifier.predict(test_X),
                                            labels=np.unique(ds.ca.Clusters),
                                            target_names=cluster_names)
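        # The classifier was trained on 80% of cells, but probabilities are predicted for all cells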
        self.proba = classifier.predict_proba(theta)

        if plot_file is not None:
            plt.figure()
            agg = npg.aggregate(ds.ca.Clusters,
                                self.proba,
                                axis=0,
                                func="mean")
            plt.imshow(agg, cmap="viridis")
            plt.xticks(np.arange(len(cluster_names)),
                       cluster_names,
                       rotation="vertical",
                       fontsize=7)
            plt.yticks(np.arange(len(cluster_names)),
                       cluster_names,
                       rotation="horizontal",
                       fontsize=7)
            plt.xlabel("Predicted cluster")
            plt.ylabel("Ground truth cluster")
            plt.title("Cluster quality (predictive power)")
            cbar = plt.colorbar()
            cbar.set_label('Probability of predicted cluster', rotation=90)
            plt.savefig(plot_file, bbox_inches="tight")
            plt.close()
        if report_file is not None:
            with open(report_file, "w") as f:
                f.write(self.report)

        return self.proba
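
A minimal usage sketch for the example above (hypothetical: the enclosing class name ClusterValidator, the loom file name, and the output file names are assumptions; the loom file must carry the Clusters, Selected and HPF attributes the method reads):

import loompy

with loompy.connect("cells.loom") as ds:  # hypothetical file
    cv = ClusterValidator()  # hypothetical enclosing class
    proba = cv.fit(ds, plot_file="quality.png", report_file="report.txt")
    print(cv.report)  # per-cluster precision/recall on the held-out 20% split
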
Example #2
    def fit(self, ds: loompy.LoomConnection) -> None:
        logging.info(f"Normalizing and selecting {self.n_genes} genes")
        normalizer = Normalizer(False)
        normalizer.fit(ds)
        genes = FeatureSelectionByVariance(self.n_genes,
                                           mask=self.mask).fit(ds)
        self.genes = genes

        if self.factorization in ("PCA", "both") or self.batch_keys is not None:
            factorization = "PCA"
        else:
            factorization = "HPF"

        if factorization == "PCA":
            logging.info(f"PCA projection to {self.n_factors} components")
            pca = PCA(genes,
                      max_n_components=self.n_factors,
                      test_significance=False,
                      batch_keys=self.batch_keys)
            transformed = pca.fit_transform(ds, normalizer)
        else:
            data = ds.sparse(rows=genes).T
            # Subsample to lowest number of UMIs
            if "TotalUMI" in ds.ca:
                totals = ds.ca.TotalUMI
            else:
                totals = ds.map([np.sum], axis=1)[0]
            min_umis = int(np.min(totals))
            logging.debug(f"Subsampling to {min_umis} UMIs")
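            # Binomial thinning: keep each UMI with probability min_umis / totals[c],
            # giving every cell the same expected sequencing depth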
            temp = data.toarray()
            for c in range(temp.shape[0]):
                temp[c, :] = np.random.binomial(temp[c, :].astype('int32'),
                                                min_umis / totals[c])
            data = sparse.coo_matrix(temp)

            # HPF factorization
            hpf = HPF(k=self.n_factors,
                      validation_fraction=0.05,
                      min_iter=10,
                      max_iter=200,
                      compute_X_ppv=False,
                      n_threads=self.n_threads)
            hpf.fit(data)
            # Normalize so the rows sum to one, because the Jensen-Shannon distance below requires distributions
            transformed = (hpf.theta.T / hpf.theta.sum(axis=1)).T

        # KNN in latent space
        logging.info(f"Computing KNN (k={self.k_pooling}) in latent space")
        with warnings.catch_warnings():
            # Suppress warnings about numba not being able to parallelize code
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            # Suppress warnings about future deprecations
            warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)
            # Suppress warnings about setting the diagonal to 1
            warnings.simplefilter("ignore", category=SparseEfficiencyWarning)
            nn = NNDescent(data=transformed,
                           metric=(jensen_shannon_distance
                                   if factorization == "HPF" else "euclidean"))
            indices, distances = nn.query(transformed, k=self.k_pooling)
            # Note: we convert distances to similarities here, to support Poisson smoothing below
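            # csr_matrix takes the (data, indices, indptr) triple; each row holds exactly k entries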
            knn = sparse.csr_matrix(
                (np.ravel(distances), np.ravel(indices),
                 np.arange(0, distances.shape[0] * distances.shape[1] + 1,
                           distances.shape[1])),
                (transformed.shape[0], transformed.shape[0]))
            max_d = knn.data.max()
            knn.data = (max_d - knn.data) / max_d
            # setdiag causes a sparse efficiency warning, but it is not a slow step relative to everything else
            knn.setdiag(1)
            self.knn = knn
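
A minimal usage sketch for the second example (hypothetical: the class name Pooler and its constructor arguments are assumptions inferred from the attributes the method reads):

import loompy

pooler = Pooler(n_genes=2000, n_factors=96, k_pooling=10,
                factorization="HPF", mask=None, batch_keys=None,
                n_threads=4)  # hypothetical class and parameter values
with loompy.connect("cells.loom") as ds:  # hypothetical file
    pooler.fit(ds)
    knn = pooler.knn  # sparse (n_cells x n_cells) similarity graph with unit diagonal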