Example 1
    def testCalcSplits(self):
        self.logger.info("BEGIN")

        bbknn = bbknn_graph(None)
        batchCounts = [('0', 3), ('1', 4)]
        splitsLocations = bbknn._calcSplits(batchCounts)
        self.assertEqual(splitsLocations, [3, 7])
        self.logger.info("splitsLocations:{}".format(splitsLocations))

        D = np.array([[0, 1, 3, 6, 5, 4, 3], [12, 0, 13, 16, 15, 14, 13],
                      [22, 21, 0, 26, 25, 24, 23], [32, 31, 33, 0, 35, 34, 33],
                      [42, 41, 43, 46, 0, 44, 43], [52, 51, 53, 56, 55, 0, 53],
                      [62, 61, 63, 66, 65, 64, 0]])
        self.logger.info("D.shape:{}".format(D.shape))

        byCols = 1
        splits = np.split(D, splitsLocations, axis=byCols)
        self.logger.info("len(splits):{}".format(len(splits)))

        # np.split(D, [3, 7], axis=1) returns three pieces:
        # [:, :3], [:, 3:7], and an empty [:, 7:]
        # we need to remove that last, empty split
        del splits[-1]
        for split in splits:
            self.logger.info("AEDWIP split.shape():{}".format(split.shape))
            self.logger.info("split\n{}\n".format(split))

        self.assertEqual(len(splits), 2)

        self.logger.info("END\n")
Example 2
    def test_l_k_adata(self):
        self.logger.info("BEGIN")

        anndata = sc.read("../PBMC.merged.h5ad")
        bb6nn = bbknn_graph(anndata,
                            neighbors_within_batch=6,
                            runPCA=False,
                            pcs=50)
        bb6nn.l_k_bbknn(l=3)

        l_k_d = anndata.uns['neighbors']['distances']
        self.logger.info("type(l_k_d):{}".format(type(l_k_d)))
        self.logger.info("l_k_d.shape:{}".format(l_k_d.shape))
        self.logger.info("l_k_d:{}\n".format(l_k_d))

        self.logger.info("END\n")
Example 3
def main():
    '''
    this is an optional driver for the class
    '''
    if (len(sys.argv) != 2):
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    # read in adata object from file system
    adata_bbknn = sc.read(sys.argv[1])

    # run bbknn on adata
    bbknn = bbknn_graph(adata_bbknn,
                        neighbors_within_batch=6,
                        runPCA=True,
                        pcs=50)

    # run louvain to cluster data
    sc.tl.louvain(bbknn._adata)

    # read in AnnData object (again)
    adata_blknn = sc.read(sys.argv[1])

    # instantiate ARIStatistic object
    myARIStat = ARIStatistic(adata=adata_blknn,
                             k_per_batch=6,
                             l=3,
                             n_components=50,
                             n_samples=10,
                             bbknn_louvain=bbknn._adata.obs['louvain'])

    # plot ARIs
    pyplot.bar(range(1,
                     len(myARIStat._results) + 1),
               myARIStat._results,
               color='black')

    # print statistics
    print('Average:', np.mean(myARIStat._results), 'SD:',
          np.std(myARIStat._results))
Example 4
def main():
    # check that exactly two command-line arguments are provided
    if (len(sys.argv) != 3):
        sys.stderr.write("usage: " + __file__ +
                         " <adata-file-path> <number-of-samples>\n")
        sys.exit(1)

    # read in adata object from file system
    adata_bbknn = sc.read(sys.argv[1])

    # run bbknn on adata
    bbknn = bbknn_graph(adata_bbknn,
                        neighbors_within_batch=6,
                        runPCA=True,
                        pcs=50)

    # run louvain to cluster data
    sc.tl.louvain(bbknn._adata)

    # read in AnnData object (again)
    adata_blknn = sc.read(sys.argv[1])

    # instantiate ARIStatistic object
    myARIStat = ARIStatistic(adata=adata_blknn,
                             k_per_batch=6,
                             l=3,
                             n_components=50,
                             n_samples=int(sys.argv[2]),
                             bbknn_louvain=bbknn._adata.obs['louvain'])

    # plot ARIs
    plt.bar(range(1,
                  len(myARIStat._results) + 1),
            myARIStat._results,
            color='black')

    # print statistics
    print('Average:', np.mean(myARIStat._results), 'SD:',
          np.std(myARIStat._results))
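
For context, a minimal sketch of what each ARI value being averaged represents, using sklearn's adjusted_rand_score; the label vectors below are hypothetical, and ARIStatistic presumably compares Louvain labelings in a similar way.

from sklearn.metrics import adjusted_rand_score

louvain_a = ['0', '0', '1', '1', '2', '2']        # hypothetical Louvain labels, run A
louvain_b = ['1', '1', '0', '0', '2', '2']        # hypothetical Louvain labels, run B
print(adjusted_rand_score(louvain_a, louvain_b))  # 1.0: identical partition, just relabelled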
Example 5
    def testBbknn(self):
        self.logger.info("BEGIN")
        # two batches
        # batch0 has 3 cells
        # batch1 has 4 cells
        pairwiseDist = np.array([[0, 1, 3, 6, 5, 4, 3],
                                 [12, 0, 13, 16, 15, 14, 13],
                                 [22, 21, 0, 26, 25, 24, 23],
                                 [32, 31, 33, 0, 35, 34, 33],
                                 [42, 41, 43, 46, 0, 44, 43],
                                 [52, 51, 53, 56, 55, 0, 53],
                                 [62, 61, 63, 66, 65, 64, 0]])

        expectedBB2NNIdx = np.array(
            [[1, 2, 6, 5], [0, 2, 6, 5], [1, 0, 6, 5], [1, 0, 6, 5],
             [1, 0, 6, 5], [1, 0, 6, 4], [1, 0, 5, 4]],
            dtype=float)

        expectedBB2NNDists = np.array([[1, 3, 3, 4], [12, 13, 13, 14],
                                       [21, 22, 23, 24], [31, 32, 33, 34],
                                       [41, 42, 43, 44], [51, 52, 53, 55],
                                       [61, 62, 64, 65]])

        bbknn = bbknn_graph(None, neighbors_within_batch=2)

        batchCounts = [('0', 3), ('1', 4)]
        retBBKNNIdx, retBBKNNDist = bbknn._bbknn(D=pairwiseDist,
                                                 batchCounts=batchCounts)
        self.logger.info("retBBKNNIdx:\n{}".format(retBBKNNIdx))
        self.logger.info("expectedBB2NNIdx:\n{}".format(expectedBB2NNIdx))
        np.testing.assert_array_equal(expectedBB2NNIdx, retBBKNNIdx)

        self.logger.info("retBBKNNDist:\n{}".format(retBBKNNDist))
        self.logger.info("expectedBB2NNDists:\n{}".format(expectedBB2NNDists))
        np.testing.assert_array_equal(expectedBB2NNDists, retBBKNNDist)

        self.logger.info("END\n")
Example 6
def main():
    # check that exactly one command-line argument is provided
    if (len(sys.argv) != 2):
        sys.stderr.write("usage: " + __file__ + " <adata-file-path>\n")
        sys.exit(1)

    # read in adata object from file system
    adata = sc.read(sys.argv[1])

    # build knn graph
    myKnnGraph = KnnG(adata=adata,
                      d_metric='euclidean',
                      n_neighbors=15,
                      method='umap',
                      runPCA=True,
                      nPC=50)

    # run louvain to cluster data
    sc.tl.louvain(myKnnGraph._adata)

    # run umap to project in 2-space
    sc.tl.umap(myKnnGraph._adata)

    # plot the knn graph
    sc.pl.umap(myKnnGraph._adata, color=['louvain'])

    # ## 3.b. [5 pts]
    # Cluster the integrated dataset using the Louvain method. Re-cluster the data now that you’ve attempted to remove the
    # batch effect. Turn in a UMAP plot showing the integrated dataset and color the cells in the plot by their Louvain
    # cluster assignments.
    #
    # read in ann data file
    anndata = sc.read(sys.argv[1])

    # run our implementation of nearest neighbors and update anndata
    myBBknnGraph = bbknn_graph(adata=anndata,
                               neighbors_within_batch=6,
                               runPCA=True,
                               pcs=50)

    # create louvain clusters
    sc.tl.louvain(myBBknnGraph._adata,
                  flavor='igraph',
                  directed=False,
                  use_weights=True)

    # project data into 2 dimensions
    sc.tl.umap(myBBknnGraph._adata)

    # display graph of louvain clusters
    sc.pl.umap(myBBknnGraph._adata, color=['louvain'])

    # ## 3.c. [10 pts]
    # Quantitatively estimate the degree to which the bb-k-NNG removed the batch
    # effect using the F-statistic described above. Calculate the F statistic using the UMAP
    # solution derived from the original, non-batch balanced 12-k-NNG. Then calculate the F-statistic
    # using the bb-6-NNG to make the UMAP solution. Report both F-statistics. Do you see an
    # improvement in the batch correction using the bb-k-NNG?
    #

    # instantiate and calculate F statistic for non-batch-balanced clusters
    nonbbFstat = FStatistic(myKnnGraph._adata)
    print("non-batch-balanced f stat: " +
          str(nonbbFstat.calculate_F_statistic()))

    # instantiate and calculate F statistic for batch balanced clusters
    bbFstat = FStatistic(myBBknnGraph._adata)
    print("batch-balanced f stat: " + str(bbFstat.calculate_F_statistic()))
Example 7
    def test_l_k_bbknn(self):
        self.logger.info("BEGIN")

        # two batches
        # batch0 has 3 cells
        # batch1 has 4 cells

        #         pairwiseDist=np. array([
        #                                 [2,3,4,6,5,4,3],
        #                                 [12,11,13,16,15,14,13],
        #                                 [22,21,23,26,25,24,23],
        #                                 [32,31,33,36,35,34,33],
        #                                 [42,41,43,46,45,44,43],
        #                                 [52,51,53,56,55,54,53],
        #                                 [62,61,63,66,65,64,63]
        #                                 ])

        bb2nnIdx = np.array([[1, 2, 6, 4], [1, 2, 6, 4], [1, 2, 6, 4],
                             [1, 2, 6, 4], [1, 2, 6, 4], [1, 2, 6, 4],
                             [1, 2, 6, 4]])

        bb2nnDists = np.array([[1, 2, 3, 4], [11, 12, 13, 14],
                               [21, 22, 23, 24], [31, 32, 33, 34],
                               [41, 42, 43, 44], [51, 52, 53, 54],
                               [61, 62, 63, 64]])

        bbknn = bbknn_graph(adata=None,
                            neighbors_within_batch=2,
                            pcs=None,
                            method=None)

        # init bbknn with our test data
        bbknn._knn_indices = bb2nnIdx
        bbknn._knn_distances = bb2nnDists
        bbknn._numBatches = 2

        bbknn._l_k_bbknnImplementation(l=1)

        # get results
        retl_knn_indices = bbknn._l_knn_indices
        self.logger.info("retl_knn_indices:\n{}".format(retl_knn_indices))

        retl_knn_distances = bbknn._l_knn_distances
        self.logger.info("retl_knn_distances:\n{}".format(retl_knn_distances))

        expectedIdx = np.array([[1, 4], [1, 6], [1, 4], [1, 6], [1, 4], [1, 6],
                                [1, 6]])
        np.testing.assert_array_equal(expectedIdx, retl_knn_indices)

        expectedDist = np.array([[1., 4.], [11., 13.], [21., 24.], [31., 33.],
                                 [41., 44.], [51., 53.], [61., 63.]])
        np.testing.assert_array_equal(expectedDist, retl_knn_distances)

        # make sure the random subsetting works as expected
        bbknn._l_k_bbknnImplementation(l=1)
        retl_knn_indices2 = bbknn._l_knn_indices
        self.logger.info("retl_knn_indices2:\n{}".format(retl_knn_indices2))

        expectedIdx2 = np.array([[2, 6], [2, 4], [2, 6], [2, 6], [2, 4],
                                 [2, 4], [2, 4]])

        np.testing.assert_array_equal(expectedIdx2, retl_knn_indices2)

        retl_knn_distances2 = bbknn._l_knn_distances
        self.logger.info(
            "retl_knn_distances2:\n{}".format(retl_knn_distances2))
        expectedDist2 = np.array([[2., 3.], [12., 14.], [22., 23.], [32., 33.],
                                  [42., 44.], [52., 54.], [62., 64.]])

        np.testing.assert_array_equal(expectedDist2, retl_knn_distances2)

        self.logger.info("END\n")
Example 8
    def l_k_bbknn(self):
        '''
        this method randomly subsamples l of the k batch-balanced neighbors produced by bbknn

        e.g. if k=6 and l=3, keep 3 of the 6 neighbors from batch1 and 3 of the 6 neighbors from batch2
        '''

        # get list of batch identifiers
        batch_unique = self._adata.obs.batch.cat.categories

        # run pca
        self._runPCA()

        # create bbknn_graph
        bbknn = bbknn_graph(
            self._adata,
            neighbors_within_batch=self._neighbors_within_batch,
            runPCA=False,
            pcs=self._n_components)

        # randomly choose which l of the k within-batch neighbor columns to keep
        random_sample = random.sample(range(0, self._neighbors_within_batch),
                                      self._l)

        # initialize the l_bbknn_indices and l_bbknn_distances matrices to zeros
        l_bbknn_indices = np.zeros((bbknn._knn_indices.shape[0],
                                    len(batch_unique) * self._l)).astype(int)
        l_bbknn_distances = np.zeros(
            (bbknn._knn_distances.shape[0], len(batch_unique) * self._l))

        # outer loop through batches
        for i in range(len(batch_unique)):
            # get batch id for ref_batch
            batch_id = batch_unique[i]

            # get boolean mask for the reference batch
            bool_idx = self._adata.obs['batch'] == batch_id

            # use the boolean mask to get pca data for the reference batch
            ref_batch_pca = self._adata.obsm['X_pca'][bool_idx]

            # build an integer index mapping ref_batch rows back to rows of the full pca matrix
            ref_batch_idx = np.arange(self._adata.shape[0])[bool_idx]

            # inner loop through batches
            for j in range(len(batch_unique)):
                # get batch id for query_batch
                batch_id = batch_unique[j]

                # get boolean mask for the query batch
                bool_idx = self._adata.obs['batch'] == batch_id

                # use the boolean mask to get pca data for the query batch
                query_batch_pca = self._adata.obsm['X_pca'][bool_idx]

                # build an integer index mapping query_batch rows back to rows of the full pca matrix
                query_batch_idx = np.arange(self._adata.shape[0])[bool_idx]

                # calculate pairwise_distances between query batch and ref_batch
                D = pairwise_distances(X=query_batch_pca, Y=ref_batch_pca)

                # get indices of the neighbors_within_batch nearest neighbors
                neighbors = np.argsort(D,
                                       axis=1)[0:,
                                               0:self._neighbors_within_batch]

                # get distances of the neighbors_within_batch nearest neighbors
                # (self is included when the query and reference batches are the same)
                sorted_D = np.sort(D, axis=1)[0:,
                                              0:self._neighbors_within_batch]

                # map nearest neighbors to pca indices
                for n in range(neighbors.shape[0]):
                    for k in range(neighbors.shape[1]):
                        temp_neighbor = neighbors[n, k]
                        neighbors[n, k] = ref_batch_idx[temp_neighbor]

                # set range of columns for indices and distances
                col_range = np.arange(i * self._l, (i + 1) * self._l)

                # copy the l randomly sampled neighbors into the
                # l_bbknn_indices and l_bbknn_distances matrices
                l_bbknn_indices[query_batch_idx[:, None],
                                col_range[None, :]] = neighbors[:,
                                                                random_sample]
                l_bbknn_distances[query_batch_idx[:, None],
                                  col_range[None, :]] = sorted_D[:,
                                                                 random_sample]

        # calculate connectivities and distances using scanpy method
        distances, connectivities = sc.neighbors.compute_connectivities_umap(
            l_bbknn_indices,
            l_bbknn_distances,
            n_obs=len(self._adata.obs),
            n_neighbors=2 * self._l)

        # set connectivities and distances in adata object
        self._adata.uns['neighbors']['connectivities'] = connectivities
        self._adata.uns['neighbors']['distances'] = distances
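
The l-from-k subsampling above amounts to keeping the same randomly chosen column positions inside each batch's block of k neighbor columns. A hypothetical standalone illustration on fake data (not part of the class):

import numpy as np
import random

k, l, n_batches, n_cells = 6, 3, 2, 7
knn_indices = np.arange(n_cells * k * n_batches).reshape(n_cells, k * n_batches)  # stand-in for bbknn output

cols = random.sample(range(k), l)                            # e.g. [4, 0, 2]
keep = [b * k + c for b in range(n_batches) for c in cols]   # same positions in every batch block
l_knn_indices = knn_indices[:, keep]                         # n_cells x (l * n_batches)
print(l_knn_indices.shape)                                   # (7, 6)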