Example #1
# The snippet assumes `kNN_type` (backend selector: 1=sklearn ball_tree, 2=scipy
# KDTree, 3=n2 HNSW, 4=pysparnn) and `distance_type` are module-level settings.
import time
from tqdm import tqdm

def find_edges(input, test, K):
    print(f"building kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type <= 3:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"done! .... time={time.time()-st_time:.3f}s")
    return edge_list
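A minimal way to call the snippet above; the backend choice and the toy data here are hypothetical, not from the original project:

import time
from scipy import sparse
from tqdm import tqdm

kNN_type = 2          # hypothetical choice: the scipy KDTree backend
distance_type = 'L2'  # only consulted by the n2 backend (kNN_type == 3)

X = sparse.random(200, 50, density=0.1, format='csr')  # toy sparse features
edges = find_edges(X, X, K=5)  # self-matches are filtered from the K+1 hits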
# Relies on module-level imports from the original project (numpy, scipy.sparse,
# sklearn, loompy, pynndescent, unidip) and on helpers such as
# FeatureSelectionByVariance, HPF, jensen_shannon_distance, and doublets_plots.
def doublet_finder(ds: loompy.LoomConnection,
                   use_pca: bool = False,
                   proportion_artificial: float = 0.20,
                   fixed_th: float = None,
                   k: int = None,
                   name: str = "tmp",
                   qc_dir: str = ".",
                   graphs: bool = True,
                   max_th: float = 1) -> np.ndarray:
    # Step 1: Generate artificial doublets from input
    logging.debug("Creating artificial doublets")
    n_real_cells = ds.shape[1]
    n_doublets = int(n_real_cells / (1 - proportion_artificial) - n_real_cells)
    doublets = np.zeros((ds.shape[0], n_doublets))
    for i in range(n_doublets):
        a = np.random.choice(ds.shape[1])
        b = np.random.choice(ds.shape[1])
        doublets[:, i] = ds[:, a] + ds[:, b]

    data_wdoublets = np.concatenate((ds[:, :], doublets), axis=1)

    logging.debug("Feature selection and dimensionality reduction")
    genes = FeatureSelectionByVariance(2000).fit(ds)
    if use_pca:
        # normalize to counts per 10 million and log-transform
        # (the reference R implementation uses log2 counts per million)
        f = np.divide(data_wdoublets.sum(axis=0), 10e6)
        norm_data = np.divide(data_wdoublets, f)
        norm_data = np.log(norm_data + 1)
        pca = PCA(n_components=50).fit_transform(norm_data[genes, :].T)
    else:
        data = sparse.coo_matrix(data_wdoublets[genes, :]).T
        hpf = HPF(k=64,
                  validation_fraction=0.05,
                  min_iter=10,
                  max_iter=200,
                  compute_X_ppv=False)
        hpf.fit(data)
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    if k is None:
        k = int(np.min([100, ds.shape[1] * 0.01]))

    logging.info(f"Initialize NN structure with k = {k}")
    if use_pca:
        knn_result = NearestNeighbors(n_neighbors=k,
                                      metric='euclidean',
                                      n_jobs=4)
        knn_result.fit(pca)
        knn_dist, knn_idx = knn_result.kneighbors(X=pca, return_distance=True)

        num = ds.shape[1]
        knn_result1 = NearestNeighbors(n_neighbors=k,
                                       metric='euclidean',
                                       n_jobs=4)
        knn_result1.fit(pca[0:num, :])
        knn_dist1, knn_idx1 = knn_result1.kneighbors(X=pca[num:, :],  # doublet columns start at index num
                                                     n_neighbors=10)
        knn_dist_rc, knn_idx_rc = knn_result1.kneighbors(X=pca[0:num, :],
                                                         return_distance=True)

    else:
        knn_result = NNDescent(data=theta, metric=jensen_shannon_distance)
        knn_idx, knn_dist = knn_result.query(theta, k=k)

        num = ds.shape[1]
        knn_result1 = NNDescent(data=theta[0:num, :],
                                metric=jensen_shannon_distance)
        knn_idx1, knn_dist1 = knn_result1.query(theta[num:, :], k=10)  # doublet rows start at index num
        knn_idx_rc, knn_dist_rc = knn_result1.query(theta[0:num, :], k=k)

    dist_th = np.mean(knn_dist1.flatten()) + 1.64 * np.std(knn_dist1.flatten())

    doublet_freq = np.logical_and(knn_idx >= ds.shape[1], knn_dist < dist_th)  # indices >= n_real_cells are doublets
    doublet_freq_A = doublet_freq[ds.shape[1]:ds.shape[1] + n_doublets, :]
    mean1 = doublet_freq_A.mean(axis=1)
    mean2 = doublet_freq_A[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score_A = np.maximum(mean1, mean2)

    doublet_freq = doublet_freq[0:ds.shape[1], :]
    mean1 = doublet_freq.mean(axis=1)
    mean2 = doublet_freq[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score = np.maximum(mean1, mean2)
    doublet_flag = np.zeros(ds.shape[1], int)
    doublet_th1 = 1
    doublet_th2 = 1
    doublet_th = 1
    #Infer TH from the data or use fixed TH

    # instantiate and fit the KDE model
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(doublet_score_A[:, None])

    # score_samples returns the log of the probability density
    xx = np.linspace(doublet_score_A.min(), doublet_score_A.max(),
                     len(doublet_score_A)).reshape(-1, 1)

    logprob = kde.score_samples(xx)
    if fixed_th is not None:
        doublet_th = float(fixed_th)
    else:
        #Check if the distribution is bimodal
        intervals = UniDip(np.exp(logprob)).run()
        if (len(intervals) > 1):
            kmeans = KMeans(n_clusters=2).fit(
                doublet_score_A.reshape(len(doublet_score_A), 1))
            high_cluster = np.where(
                kmeans.cluster_centers_ == max(kmeans.cluster_centers_))[0][0]
            doublet_th1 = np.around(np.min(
                doublet_score_A[kmeans.labels_ == high_cluster]),
                                    decimals=3)

        #0.5% for every 1000 cells - the rate of detectable doublets by 10X
        doublet_th2 = np.percentile(doublet_score, 100 - (5e-4 * ds.shape[1]))
        doublet_th2 = np.around(doublet_th2, decimals=3)
        #The TH shouldn't be higher than indicated
        if doublet_th2 > max_th:
            doublet_th2 = max_th
        if doublet_th1 > max_th:
            doublet_th1 = max_th
        if (len(np.where(doublet_score >= doublet_th1)[0]) >
            (len(np.where(doublet_score >= doublet_th2)[0]))):
            doublet_th = doublet_th2
        else:
            doublet_th = doublet_th1
    doublet_flag[doublet_score >= doublet_th] = 1

    #Calculate the score for the cells that are nn of the marked doublets
    if use_pca:
        pca_rc = pca[0:num, :]
        knn_dist1_rc, knn_idx1_rc = knn_result1.kneighbors(
            X=pca_rc[doublet_flag == 1, :],
            n_neighbors=10,
            return_distance=True)
    else:
        theta_rc = theta[0:num, :]
        knn_idx1_rc, knn_dist1_rc = knn_result1.query(
            theta_rc[doublet_flag == 1, :], k=10)

    dist_th = np.mean(
        knn_dist1_rc.flatten()) + 1.64 * np.std(knn_dist1_rc.flatten())
    doublet2_freq = np.logical_and(doublet_flag[knn_idx_rc] == 1,
                                   knn_dist_rc < dist_th)
    doublet2_nn = knn_dist_rc < dist_th
    doublet2_score = doublet2_freq.sum(axis=1) / doublet2_nn.sum(axis=1)

    doublet_flag[np.logical_and(doublet_flag == 0,
                                doublet2_score >= doublet_th / 2)] = 2

    if graphs:

        if use_pca:
            ds.ca.PCA = pca[0:ds.shape[1], :]
        else:
            ds.ca.HPF = theta[0:ds.shape[1], :]
        doublets_plots.plot_all(ds,
                                out_file=os.path.join(qc_dir, name + "_doublets.png"),
                                labels=doublet_flag,
                                labels=doublet_flag,
                                doublet_score_A=doublet_score_A,
                                logprob=logprob,
                                xx=xx,
                                score1=doublet_th1,
                                score2=doublet_th2,
                                score=doublet_th)

    logging.info(
        f"Doublet fraction: {100*len(np.where(doublet_flag>0)[0])/ds.shape[1]:.2f}%, {len(np.where(doublet_flag>0)[0])} cells. \n\t\t\t(Expected detectable doublet fraction: {(5e-4*ds.shape[1]):.2f}%)"
    )

    return doublet_score, doublet_flag
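A sketch of how doublet_finder might be invoked; the loom file name and QC directory are placeholders:

import loompy

with loompy.connect("sample.loom") as ds:
    score, flag = doublet_finder(ds, use_pca=True, graphs=False,
                                 name="sample", qc_dir="qc")
    print(f"{(flag > 0).sum()} of {ds.shape[1]} cells flagged as doublets")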
# Relies on module-level imports (numpy, scipy.sparse, sklearn, pynndescent) and
# on helper functions from the same module: knn_balance, make_mutual,
# connectivity_to_weights, jensen_shannon_distance.
class BalancedKNN:
    """Greedy algorythm to balance a K-nearest neighbour graph

	It has an API similar to scikit-learn

	Parameters
	----------
	k : int  (default=50)
		the number of neighbours in the final graph
	sight_k : int  (default=100)
		the number of neighbours in the initialization graph.
		It corresponds to the farthest neighbour that a sample is allowed to connect to
		when its closer neighbours are all blocked. If sight_k is reached, the matrix is filled
		with the sample itself
	maxl : int  (default=200)
		max degree of connectivity allowed. Avoids the presence of hubs in the graph: it is the
		maximum number of neighbours that are allowed to contact a node before the node is blocked
	mode : str (default="connectivity")
		decide wich kind of utput "distance" or "connectivity"
	n_jobs : int  (default=4)
		parallelization of the standard KNN search performed at initialization
	"""
    def __init__(self,
                 k: int = 50,
                 sight_k: int = 100,
                 maxl: int = 200,
                 mode: str = "distance",
                 metric: str = "euclidean",
                 minkowski_p: int = 20,
                 n_jobs: int = 4) -> None:
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.n_jobs = n_jobs
        self.dist_new = self.dsi_new = self.l = None  # type: np.ndarray
        self.bknn = None  # type: sparse.csr_matrix
        self.minkowski_p = minkowski_p

    @property
    def n_samples(self) -> int:
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model

		data: np.ndarray (samples, features)
			the data matrix to fit
		sight_k: int
			the farthest point that a node is allowed to connect to when its closer neighbours are all blocked
		"""
        self.data = data
        self.fitdata = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples} samples"
        )
        np.random.seed(13)
        if self.metric == "correlation":
            self.nn = NearestNeighbors(n_neighbors=self.sight_k + 1,
                                       metric=self.metric,
                                       p=self.minkowski_p,
                                       n_jobs=self.n_jobs,
                                       algorithm="brute")
            self.nn.fit(self.fitdata)
        elif self.metric == "js":
            # self.nn = cg.BallTreeJS(data=self.fitdata, leaf_size=10)
            self.nn = NNDescent(data=self.fitdata,
                                metric=jensen_shannon_distance)
        else:
            self.nn = NearestNeighbors(n_neighbors=self.sight_k + 1,
                                       metric=self.metric,
                                       p=self.minkowski_p,
                                       n_jobs=self.n_jobs,
                                       leaf_size=30)
            self.nn.fit(self.fitdata)
        return self

    def kneighbors(self,
                   X: np.ndarray = None,
                   maxl: int = None,
                   mode: str = "distance"
                   ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Finds the K-neighbors of a point.

			Returns indices of and distances to the neighbors of each point.

			Parameters
			----------
			X : array-like, shape (n_query, n_features),
				The query point or points.
				If not provided, neighbors of each indexed point are returned.
				In this case, the query point is not considered its own neighbor.

			maxl: int
				max degree of connectivity allowed

			mode : "distance" or "connectivity"
				Decides the kind of output

			Returns
			-------
			dist_new : np.ndarray (samples, k+1)
				distances to the NN
			dsi_new : np.ndarray (samples, k+1)
				indexes of the NN, first column is the sample itself
			l: np.ndarray (samples)
				l[i] is the number of connections from other samples to the sample i

			NOTE:
			The first column (0) corresponds to the sample itself; the nearest neighbour is in the second column (1)

		"""
        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl
        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self.nn.query(self.data,
                                                    k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self.nn.kneighbors(self.data,
                                                         return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self.nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self.nn.kneighbors(self.data, return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0
        logging.debug(
            f"Using the initialization network to find a {self.k}-NN graph with maximum connectivity of {self.maxl}"
        )
        self.dist_new, self.dsi_new, self.l = knn_balance(self.dsi,
                                                          self.dist,
                                                          maxl=self.maxl,
                                                          k=self.k)
        return self.dist_new, self.dsi_new, self.l

    def kneighbors_graph(self,
                         X: np.ndarray = None,
                         maxl: int = None,
                         mode: str = "distance") -> sparse.csr_matrix:
        """Retrun the K-neighbors graph as a sparse csr matrix

			Parameters
			----------
			X : array-like, shape (n_query, n_features),
				The query point or points.
				If not provided, neighbors of each indexed point are returned.
				In this case, the query point is not considered its own neighbor.

			maxl: int
				max degree of connectivity allowed

			mode : "distance" or "connectivity"
				Decides the kind of output

			Returns
			-------
			neighbor_graph : scipy.sparse.csr_matrix
				The values are either distances or connectivities, depending on the mode parameter

			NOTE: The diagonal will be zero even though the value 0 is actually stored

		"""
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new), np.ravel(dsi_new),
             np.arange(0, dist_new.shape[0] * dist_new.shape[1] + 1,
                       dist_new.shape[1])), (self.n_samples, self.n_samples))
        return self.bknn

    def smooth_data(self,
                    data_to_smooth: np.ndarray,
                    X: np.ndarray = None,
                    maxl: int = None,
                    mutual: bool = False,
                    only_increase: bool = True) -> np.ndarray:
        """Use the wights learned from knn to smooth any data matrix

		Arguments
		---------
		data_to_smooth: (features, samples) !! NOTE !! this is different from the input (for speed reasons)
			if the data is provided as (samples, features), this will be detected and
			the correct operation performed at the cost of some efficiency.
			In the case where samples == features, the shape (features, samples) will be assumed
		
		"""
        if self.bknn is None:
            assert (X is None) and (
                maxl is None
            ), "X and maxl can only be used by calling kneighbors_graph first"
            self.kneighbors_graph(X=X, maxl=maxl, mode=self.mode)
        if mutual:
            connectivity = make_mutual(self.bknn > 0)
        else:
            connectivity = self.bknn.T > 0
        connectivity = connectivity.tolil()
        connectivity.setdiag(1)
        w = connectivity_to_weights(connectivity).T
        assert np.allclose(
            w.sum(0), 1), "weight matrix needs to sum to one over the columns"
        if data_to_smooth.shape[1] == w.shape[0]:
            result = sparse.csr_matrix.dot(data_to_smooth, w)
        elif data_to_smooth.shape[0] == w.shape[0]:
            result = sparse.csr_matrix.dot(data_to_smooth.T, w).T
        else:
            raise ValueError(
                f"Incorrect matrix size: neither axis matches the graph dimension {w.shape}"
            )

        if only_increase:
            return np.maximum(result, data_to_smooth)
        else:
            return result
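A usage sketch for BalancedKNN, assuming the helpers it calls (knn_balance, make_mutual, connectivity_to_weights) are importable from the same module:

import numpy as np

X = np.random.normal(size=(500, 30))           # 500 samples, 30 features
bknn = BalancedKNN(k=10, sight_k=30, maxl=60, mode="distance")
bknn.fit(X)
dist, dsi, l = bknn.kneighbors()               # balanced neighbourhood per sample
G = bknn.kneighbors_graph(mode="distance")     # sparse (500, 500) csr matrix
smoothed = bknn.smooth_data(X.T)               # note the (features, samples) layout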
Example #4
# As in Example #1, `time` and `tqdm` are assumed to be imported at module level;
# `kNN_type`, `distance_type`, and `num_threads` are module-level settings.
def find_edges(input, test, K):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        
        space_names = ['l2_sparse', 'cosinesimil_sparse'] # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        '''
        def calc_zero_rows(i):
            if input[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0])))
        print(f"# zero rows in input = {zero_row_num}", end=" ")
        '''
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        '''
        def calc_zero_rows2(i):
            if test[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0])))
        print(f"# zero rows in test = {zero_row_num}")
        '''

        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K+1, f"index1={index1} len(per)={len(per)} != K+1={K+1}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
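The edge lists returned by these functions are plain (source, target) index pairs. One way to consume them, sketched here with scipy (not part of the original snippets), is to build a sparse adjacency matrix:

import numpy as np
from scipy import sparse

def edges_to_adjacency(edge_list, n_nodes):
    # one directed edge per (source, target) pair; duplicates are summed by coo
    rows, cols = zip(*edge_list)
    data = np.ones(len(edge_list), dtype=np.int8)
    return sparse.coo_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes)).tocsr()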
Example #5
# As above, `time`, `tqdm`, `kNN_type`, and `distance_type` are assumed to come
# from the module level of the original project.
def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)

    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((query_ids[index1], cluster_ids[index2]))  # map positions back to the supplied ids
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list