def testCalcSplits(self):
    """Check ``_calcSplits`` and show how its output partitions a matrix by column."""
    self.logger.info("BEGIN")

    graph = bbknn_graph(None)
    perBatchCounts = [('0', 3), ('1', 4)]
    splitsLocations = graph._calcSplits(perBatchCounts)
    self.assertEqual(splitsLocations, [3, 7])
    self.logger.info("splitsLocations:{}".format(splitsLocations))

    D = np.array([
        [0, 1, 3, 6, 5, 4, 3],
        [12, 0, 13, 16, 15, 14, 13],
        [22, 21, 0, 26, 25, 24, 23],
        [32, 31, 33, 0, 35, 34, 33],
        [42, 41, 43, 46, 0, 44, 43],
        [52, 51, 53, 56, 55, 0, 53],
        [62, 61, 63, 66, 65, 64, 0],
    ])
    self.logger.info("D.shape:{}".format(D.shape))

    # Splitting along axis 1 at [3, 7] yields D[:, :3], D[:, 3:7] and
    # D[:, 7:].  D has 7 columns, so the trailing piece is empty.
    columnAxis = 1
    pieces = np.split(D, splitsLocations, axis=columnAxis)
    self.logger.info("AEDWIP len(splits):{}".format(len(pieces)))

    # Discard the empty trailing piece; only the per-batch pieces remain.
    pieces.pop()

    for piece in pieces:
        self.logger.info("AEDWIP split.shape():{}".format(piece.shape))
        self.logger.info("split\n{}\n".format(piece))

    self.assertEqual(len(pieces), 2)
    self.logger.info("END\n")
def test_l_k_adata(self):
    """Smoke-test ``l_k_bbknn`` on the merged PBMC file and log the stored distances.

    Makes no assertions; it only exercises the code path and logs the
    type, shape and content of ``uns['neighbors']['distances']``.
    """
    self.logger.info("BEGIN")

    adata = sc.read("../PBMC.merged.h5ad")
    graph = bbknn_graph(adata, neighbors_within_batch=6, runPCA=False,
                        pcs=50)
    graph.l_k_bbknn(l=3)

    # Read the distances back from the AnnData object that was handed to
    # the graph.
    distances = adata.uns['neighbors']['distances']
    self.logger.info("type(l_k_d):{}".format(type(distances)))
    self.logger.info("l_k_d.shape:{}".format(distances.shape))
    self.logger.info("l_k_d:{}\n".format(distances))

    self.logger.info("END\n")
def main():
    '''
    Optional command-line driver for the class.

    Expects exactly one argument, the path to an AnnData file.  Builds the
    batch-balanced graph, clusters it with Louvain, computes the ARI
    statistic over 10 samples against that clustering, then plots the ARIs
    as a bar chart and prints their mean and standard deviation.
    '''
    if len(sys.argv) != 2:
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    # Load the data and build the batch-balanced graph on it.
    reference_adata = sc.read(sys.argv[1])
    reference_graph = bbknn_graph(reference_adata, neighbors_within_batch=6,
                                  runPCA=True, pcs=50)

    # Louvain clustering of the batch-balanced graph.
    sc.tl.louvain(reference_graph._adata)

    # Load the file a second time so the ARI statistic gets its own object.
    sampling_adata = sc.read(sys.argv[1])
    ari_stat = ARIStatistic(
        adata=sampling_adata,
        k_per_batch=6,
        l=3,
        n_components=50,
        n_samples=10,
        bbknn_louvain=reference_graph._adata.obs['louvain'])

    # One bar per sample, numbered from 1.
    pyplot.bar(range(1, len(ari_stat._results) + 1), ari_stat._results,
               color='black')

    print('Average:', np.mean(ari_stat._results),
          'SD:', np.std(ari_stat._results))
def main():
    """Command-line driver: compute and plot ARI statistics for l-k-bbknn.

    Usage: ``<script> <adata-file-path> <number-of-samples>``

    Builds the batch-balanced graph for the given AnnData file, clusters it
    with Louvain, computes the ARI statistic over ``number-of-samples``
    samples against that clustering, then plots the ARIs as a bar chart and
    prints their mean and standard deviation.

    Exits with status 1, after writing a message to stderr, when the
    argument count is wrong or ``number-of-samples`` is not an integer.
    """
    usage = "usage: " + __file__ + " <adata-file-path> <number-of-samples>\n"

    # Exactly two command-line arguments are required (argv[0] is the
    # script itself).
    if len(sys.argv) != 3:
        sys.stderr.write(usage)
        sys.exit(1)

    adata_path = sys.argv[1]

    # Validate the sample count before any expensive work so that a typo
    # fails immediately instead of after loading and clustering the data.
    try:
        n_samples = int(sys.argv[2])
    except ValueError:
        sys.stderr.write("error: <number-of-samples> must be an integer, "
                         "got '" + sys.argv[2] + "'\n")
        sys.stderr.write(usage)
        sys.exit(1)

    # read in adata object from file system
    adata_bbknn = sc.read(adata_path)

    # run bbknn on adata
    bbknn = bbknn_graph(adata_bbknn, neighbors_within_batch=6, runPCA=True,
                        pcs=50)

    # run louvain to cluster data
    sc.tl.louvain(bbknn._adata)

    # read in AnnData object (again) so the ARI statistic gets its own copy
    adata_blknn = sc.read(adata_path)

    # instantiate ARIStatistic object
    myARIStat = ARIStatistic(
        adata=adata_blknn,
        k_per_batch=6,
        l=3,
        n_components=50,
        n_samples=n_samples,
        bbknn_louvain=bbknn._adata.obs['louvain'])

    # plot ARIs, one bar per sample, numbered from 1
    plt.bar(range(1, len(myARIStat._results) + 1), myARIStat._results,
            color='black')

    # print statistics
    print('Average:', np.mean(myARIStat._results),
          'SD:', np.std(myARIStat._results))
def testBbknn(self):
    """Check ``_bbknn`` against hand-computed neighbours for a 7-cell, 2-batch matrix."""
    self.logger.info("BEGIN")

    # Two batches: batch '0' holds 3 cells and batch '1' holds 4 cells.
    batchCounts = [('0', 3), ('1', 4)]
    pairwiseDist = np.array([
        [0, 1, 3, 6, 5, 4, 3],
        [12, 0, 13, 16, 15, 14, 13],
        [22, 21, 0, 26, 25, 24, 23],
        [32, 31, 33, 0, 35, 34, 33],
        [42, 41, 43, 46, 0, 44, 43],
        [52, 51, 53, 56, 55, 0, 53],
        [62, 61, 63, 66, 65, 64, 0],
    ])

    # Expected neighbour indices and distances, four per cell.
    expectedBB2NNIdx = np.array(
        [
            [1, 2, 6, 5],
            [0, 2, 6, 5],
            [1, 0, 6, 5],
            [1, 0, 6, 5],
            [1, 0, 6, 5],
            [1, 0, 6, 4],
            [1, 0, 5, 4],
        ],
        dtype=float)
    expectedBB2NNDists = np.array([
        [1, 3, 3, 4],
        [12, 13, 13, 14],
        [21, 22, 23, 24],
        [31, 32, 33, 34],
        [41, 42, 43, 44],
        [51, 52, 53, 55],
        [61, 62, 64, 65],
    ])

    graph = bbknn_graph(None, neighbors_within_batch=2)
    retBBKNNIdx, retBBKNNDist = graph._bbknn(D=pairwiseDist,
                                             batchCounts=batchCounts)

    self.logger.info("retBBKNNIdx:\n{}".format(retBBKNNIdx))
    self.logger.info("expectedBB2NNIdx:\n{}".format(expectedBB2NNIdx))
    np.testing.assert_array_equal(expectedBB2NNIdx, retBBKNNIdx)

    self.logger.info("retBBKNNDist:\n{}".format(retBBKNNDist))
    self.logger.info("expectedBB2NNDists:\n{}".format(expectedBB2NNDists))
    np.testing.assert_array_equal(expectedBB2NNDists, retBBKNNDist)

    self.logger.info("END\n")
def main():
    """Command-line driver comparing a plain kNN graph with a batch-balanced one.

    Usage: ``<script> <adata-file-path>``

    For each graph the data is clustered with Louvain, projected with UMAP
    and plotted coloured by cluster; finally the F statistic of both
    results is printed.
    """
    # Exactly one command-line argument is required.
    if len(sys.argv) != 2:
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    # --- Plain (non-batch-balanced) kNN graph ------------------------------
    plain_adata = sc.read(sys.argv[1])
    plainGraph = KnnG(adata=plain_adata, d_metric='euclidean',
                      n_neighbors=15, method='umap', runPCA=True, nPC=50)

    # Cluster, project into 2-space, and plot coloured by cluster.
    sc.tl.louvain(plainGraph._adata)
    sc.tl.umap(plainGraph._adata)
    sc.pl.umap(plainGraph._adata, color=['louvain'])

    # --- 3.b. [5 pts] -------------------------------------------------------
    # Cluster the integrated dataset using the Louvain method: re-cluster
    # the data after attempting to remove the batch effect, and produce a
    # UMAP plot of the integrated dataset coloured by Louvain cluster.
    balanced_adata = sc.read(sys.argv[1])

    # Run our nearest-neighbours implementation, which updates the AnnData.
    balancedGraph = bbknn_graph(adata=balanced_adata,
                                neighbors_within_batch=6, runPCA=True,
                                pcs=50)

    sc.tl.louvain(balancedGraph._adata, flavor='igraph', directed=False,
                  use_weights=True)
    sc.tl.umap(balancedGraph._adata)
    sc.pl.umap(balancedGraph._adata, color=['louvain'])

    # --- 3.c. [10 pts] ------------------------------------------------------
    # Quantitatively estimate how far the bb-k-NNG removed the batch effect
    # using the F statistic: once on the UMAP solution from the original,
    # non-batch-balanced 12-k-NNG and once on the bb-6-NNG solution.
    # Report both and judge whether batch correction improved.
    # NOTE(review): the prompt says 12-k-NNG but the graph above is built
    # with n_neighbors=15 -- confirm which is intended.
    plainFstat = FStatistic(plainGraph._adata)
    print("non-batch-balanced f stat: "
          + str(plainFstat.calculate_F_statistic()))

    balancedFstat = FStatistic(balancedGraph._adata)
    print("batch-balanced f stat: "
          + str(balancedFstat.calculate_F_statistic()))
def test_l_k_bbknn(self):
    """Check ``_l_k_bbknnImplementation`` sub-samples a hand-built bb-2-NN result.

    With ``l=1`` the expected output has two columns per row: one entry
    taken from the first two input columns and one from the last two.
    """
    self.logger.info("BEGIN")

    # Two batches; the fixture has 7 rows (cells) with 4 neighbours each.
    # Distance matrix sketched by the original author (not used by the
    # test):
    #   [[ 2,  3,  4,  6,  5,  4,  3],
    #    [12, 11, 13, 16, 15, 14, 13],
    #    [22, 21, 23, 26, 25, 24, 23],
    #    [32, 31, 33, 36, 35, 34, 33],
    #    [42, 41, 43, 46, 45, 44, 43],
    #    [52, 51, 53, 56, 55, 54, 53],
    #    [62, 61, 63, 66, 65, 64, 63]]
    bb2nnIdx = np.array([
        [1, 2, 6, 4],
        [1, 2, 6, 4],
        [1, 2, 6, 4],
        [1, 2, 6, 4],
        [1, 2, 6, 4],
        [1, 2, 6, 4],
        [1, 2, 6, 4],
    ])
    bb2nnDists = np.array([
        [1, 2, 3, 4],
        [11, 12, 13, 14],
        [21, 22, 23, 24],
        [31, 32, 33, 34],
        [41, 42, 43, 44],
        [51, 52, 53, 54],
        [61, 62, 63, 64],
    ])

    # Build an empty graph object and inject the fixture directly.
    graph = bbknn_graph(adata=None, neighbors_within_batch=2, pcs=None,
                        method=None)
    graph._knn_indices = bb2nnIdx
    graph._knn_distances = bb2nnDists
    graph._numBatches = 2

    # ---- first sub-sampling pass ----
    graph._l_k_bbknnImplementation(l=1)

    firstIdx = graph._l_knn_indices
    self.logger.info("retl_knn_indices:\n{}".format(firstIdx))
    firstDist = graph._l_knn_distances
    self.logger.info("retl_knn_distances:\n{}".format(firstDist))

    expectedIdx = np.array(
        [[1, 4], [1, 6], [1, 4], [1, 6], [1, 4], [1, 6], [1, 6]])
    np.testing.assert_array_equal(expectedIdx, firstIdx)

    expectedDist = np.array([
        [1., 4.],
        [11., 13.],
        [21., 24.],
        [31., 33.],
        [41., 44.],
        [51., 53.],
        [61., 63.],
    ])
    np.testing.assert_array_equal(expectedDist, firstDist)

    # ---- second pass: make sure random sub-setting works as expected ----
    # NOTE(review): the fixed expectations below presumably rely on the
    # implementation using a deterministic random sequence -- confirm.
    graph._l_k_bbknnImplementation(l=1)

    secondIdx = graph._l_knn_indices
    self.logger.info("retl_knn_indices2:\n{}".format(secondIdx))
    expectedIdx2 = np.array(
        [[2, 6], [2, 4], [2, 6], [2, 6], [2, 4], [2, 4], [2, 4]])
    np.testing.assert_array_equal(expectedIdx2, secondIdx)

    secondDist = graph._l_knn_distances
    self.logger.info("retl_knn_distances2:\n{}".format(secondDist))
    expectedDist2 = np.array([
        [2., 3.],
        [12., 14.],
        [22., 23.],
        [32., 33.],
        [42., 44.],
        [52., 54.],
        [62., 64.],
    ])
    np.testing.assert_array_equal(expectedDist2, secondDist)

    self.logger.info("END\n")
def l_k_bbknn(self):
    '''
    Build an l-subsampled batch-balanced kNN graph and store it in the
    AnnData object.

    For every (reference batch, query batch) pair, the
    ``neighbors_within_batch`` nearest reference cells of each query cell
    are found in PCA space.  The same randomly chosen ``l`` of those k rank
    positions are kept for every cell and batch; e.g. if k=6 and l=3, each
    cell keeps 3 neighbors from batch1 and 3 neighbors from batch2.

    Reads ``self._adata``, ``self._neighbors_within_batch``, ``self._l``
    and ``self._n_components``.  Writes the resulting matrices to
    ``self._adata.uns['neighbors']['connectivities']`` and
    ``self._adata.uns['neighbors']['distances']``.  Returns nothing.
    '''
    # get list of batch identifiers
    batch_unique = self._adata.obs.batch.cat.categories

    # run pca
    self._runPCA()

    # create bbknn_graph; its kNN matrices supply the row count used below
    bbknn = bbknn_graph(
        self._adata,
        neighbors_within_batch=self._neighbors_within_batch,
        runPCA=False,
        pcs=self._n_components)

    # rank positions (out of the k nearest) kept for every cell and batch
    random_sample = random.sample(range(0, self._neighbors_within_batch),
                                  self._l)

    # one block of l columns per reference batch, initialized to 0's
    n_columns = len(batch_unique) * self._l
    l_bbknn_indices = np.zeros(
        (bbknn._knn_indices.shape[0], n_columns)).astype(int)
    l_bbknn_distances = np.zeros(
        (bbknn._knn_distances.shape[0], n_columns))

    # Per-batch PCA rows and their row numbers in the full matrix.  They do
    # not depend on the batch they are paired with, so compute them once
    # instead of once per (reference, query) pair.
    pca = self._adata.obsm['X_pca']
    all_rows = np.arange(self._adata.shape[0])
    batch_labels = self._adata.obs['batch']
    per_batch = []
    for batch_id in batch_unique:
        in_batch = np.asarray(batch_labels == batch_id)
        per_batch.append((pca[in_batch], all_rows[in_batch]))

    # outer loop: reference batch (the batch the neighbors come from)
    for i, (ref_batch_pca, ref_batch_idx) in enumerate(per_batch):
        # output columns reserved for neighbors from this reference batch
        col_range = np.arange(i * self._l, (i + 1) * self._l)

        # inner loop: query batch (the cells receiving the neighbors)
        for query_batch_pca, query_batch_idx in per_batch:
            # pairwise distances, one row per query cell
            D = pairwise_distances(X=query_batch_pca, Y=ref_batch_pca)

            # k nearest reference cells per query cell (including self
            # when query and reference batch coincide)
            nearest = np.argsort(D, axis=1)[:, :self._neighbors_within_batch]
            sorted_D = np.sort(D, axis=1)[:, :self._neighbors_within_batch]

            # translate positions inside the reference batch into row
            # numbers of the full PCA matrix in one vectorized lookup
            neighbors = ref_batch_idx[nearest]

            # keep only the randomly sampled l rank positions
            l_bbknn_indices[query_batch_idx[:, None],
                            col_range[None, :]] = neighbors[:, random_sample]
            l_bbknn_distances[query_batch_idx[:, None],
                              col_range[None, :]] = sorted_D[:, random_sample]

    # calculate connectivities and distances using scanpy method; the
    # neighbor count must match the number of columns actually filled
    # (l per batch), not assume exactly two batches
    distances, connectivities = sc.neighbors.compute_connectivities_umap(
        l_bbknn_indices,
        l_bbknn_distances,
        n_obs=len(self._adata.obs),
        n_neighbors=l_bbknn_indices.shape[1])

    # set connectivities and distances in adata object
    self._adata.uns['neighbors']['connectivities'] = connectivities
    self._adata.uns['neighbors']['distances'] = distances