def find_edges(input, test, K): print(f"building kNN classifier ... ", end=" ") st_time = time.time() if kNN_type <= 3: input, test = input.todense(), test.todense() if kNN_type == 1: from sklearn.neighbors import NearestNeighbors tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input) elif kNN_type == 2: from scipy import spatial tree = spatial.KDTree(input) elif kNN_type == 3: from n2 import HnswIndex tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2'] for index in tqdm(range(input.shape[0])): tree.add_data(input[index, :]) tree.build(n_threads=10) elif kNN_type == 4: import pysparnn.cluster_index as ci input_num = input.shape[0] tree = ci.MultiClusterIndex(input, range(input_num)) else: raise NotImplementedError print(f"time={time.time()-st_time:.3f}s") print("finding indices ... ", end=" ") if kNN_type == 1: _, indices = tree.kneighbors(test) elif kNN_type == 2: _, indices = tree.query(test, k=K + 1) elif kNN_type == 3: indices = [] for i in tqdm(range(test.shape[0])): indices.append(tree.search_by_vector(test[i, :], k=K + 1)) else: indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False) print(f"time={time.time()-st_time:.3f}s") edge_list = [] for index1, per in enumerate(indices): for index2 in per: index2 = int(index2) if index1 != index2: edge_list.append((index1, index2)) print(f"done! .... time={time.time()-st_time:.3f}s") return edge_list
def doublet_finder(ds: loompy.LoomConnection, use_pca: bool = False, proportion_artificial: float = 0.20,
                   fixed_th: float = None, k: int = None, name: object = "tmp", qc_dir: object = ".",
                   graphs: bool = True, max_th: float = 1) -> Tuple[np.ndarray, np.ndarray]:
    # Module-level imports assumed: numpy as np, loompy, logging, os, scipy.sparse,
    # sklearn (PCA, KMeans, KernelDensity, NearestNeighbors), pynndescent (NNDescent),
    # unidip (UniDip), plus the project-internal HPF, FeatureSelectionByVariance,
    # jensen_shannon_distance and doublets_plots.

    # Step 1: Generate artificial doublets from the input
    logging.debug("Creating artificial doublets")
    n_real_cells = ds.shape[1]
    n_doublets = int(n_real_cells / (1 - proportion_artificial) - n_real_cells)
    doublets = np.zeros((ds.shape[0], n_doublets))
    for i in range(n_doublets):
        a = np.random.choice(ds.shape[1])
        b = np.random.choice(ds.shape[1])
        doublets[:, i] = ds[:, a] + ds[:, b]

    data_wdoublets = np.concatenate((ds[:, :], doublets), axis=1)

    # Step 2: Feature selection and dimensionality reduction
    logging.debug("Feature selection and dimensionality reduction")
    genes = FeatureSelectionByVariance(2000).fit(ds)
    if use_pca:
        # the reference R implementation uses log2 counts per million;
        # here a natural-log library-size normalization is applied
        f = np.divide(data_wdoublets.sum(axis=0), 10e6)
        norm_data = np.divide(data_wdoublets, f)
        norm_data = np.log(norm_data + 1)
        pca = PCA(n_components=50).fit_transform(norm_data[genes, :].T)
    else:
        data = sparse.coo_matrix(data_wdoublets[genes, :]).T
        hpf = HPF(k=64, validation_fraction=0.05, min_iter=10, max_iter=200, compute_X_ppv=False)
        hpf.fit(data)
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T

    if k is None:
        k = int(np.min([100, ds.shape[1] * 0.01]))

    # Step 3: kNN search over the joint set of real cells and doublets
    logging.info(f"Initialize NN structure with k = {k}")
    if use_pca:
        knn_result = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
        knn_result.fit(pca)
        knn_dist, knn_idx = knn_result.kneighbors(X=pca, return_distance=True)

        num = ds.shape[1]
        knn_result1 = NearestNeighbors(n_neighbors=k, metric='euclidean', n_jobs=4)
        knn_result1.fit(pca[0:num, :])
        # rows num: onward are the artificial doublets
        knn_dist1, knn_idx1 = knn_result1.kneighbors(X=pca[num:, :], n_neighbors=10)
        knn_dist_rc, knn_idx_rc = knn_result1.kneighbors(X=pca[0:num, :], return_distance=True)
    else:
        knn_result = NNDescent(data=theta, metric=jensen_shannon_distance)
        knn_idx, knn_dist = knn_result.query(theta, k=k)

        num = ds.shape[1]
        knn_result1 = NNDescent(data=theta[0:num, :], metric=jensen_shannon_distance)
        knn_idx1, knn_dist1 = knn_result1.query(theta[num:, :], k=10)
        knn_idx_rc, knn_dist_rc = knn_result1.query(theta[0:num, :], k=k)

    dist_th = np.mean(knn_dist1.flatten()) + 1.64 * np.std(knn_dist1.flatten())

    # a neighbour counts as doublet evidence if it is artificial
    # (column index >= number of real cells) and closer than dist_th
    doublet_freq = np.logical_and(knn_idx >= ds.shape[1], knn_dist < dist_th)
    doublet_freq_A = doublet_freq[ds.shape[1]:ds.shape[1] + n_doublets, :]
    mean1 = doublet_freq_A.mean(axis=1)
    mean2 = doublet_freq_A[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score_A = np.maximum(mean1, mean2)

    doublet_freq = doublet_freq[0:ds.shape[1], :]
    mean1 = doublet_freq.mean(axis=1)
    mean2 = doublet_freq[:, 0:int(np.ceil(k / 2))].mean(axis=1)
    doublet_score = np.maximum(mean1, mean2)
    doublet_flag = np.zeros(ds.shape[1], int)
    doublet_th1 = 1
    doublet_th2 = 1
    doublet_th = 1

    # Step 4: Infer the threshold from the data, or use the fixed threshold.
    # Instantiate and fit the KDE model; score_samples returns the log density
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(doublet_score_A[:, None])
    xx = np.linspace(doublet_score_A.min(), doublet_score_A.max(), len(doublet_score_A)).reshape(-1, 1)
    logprob = kde.score_samples(xx)

    if fixed_th is not None:
        doublet_th = float(fixed_th)
    else:
        # check whether the score distribution is bimodal
        intervals = UniDip(np.exp(logprob)).run()
        if len(intervals) > 1:
            kmeans = KMeans(n_clusters=2).fit(doublet_score_A.reshape(len(doublet_score_A), 1))
            high_cluster = np.where(kmeans.cluster_centers_ == max(kmeans.cluster_centers_))[0][0]
            doublet_th1 = np.around(np.min(doublet_score_A[kmeans.labels_ == high_cluster]), decimals=3)

        # 0.5% per 1000 cells: the rate of detectable doublets reported for 10X
        doublet_th2 = np.percentile(doublet_score, 100 - (5e-4 * ds.shape[1]))
        doublet_th2 = np.around(doublet_th2, decimals=3)
        # the threshold should not exceed max_th
        if doublet_th2 > max_th:
            doublet_th2 = max_th
        if doublet_th1 > max_th:
            doublet_th1 = max_th
        if len(np.where(doublet_score >= doublet_th1)[0]) > len(np.where(doublet_score >= doublet_th2)[0]):
            doublet_th = doublet_th2
        else:
            doublet_th = doublet_th1

    doublet_flag[doublet_score >= doublet_th] = 1

    # Step 5: Calculate the score for the cells that are nearest neighbours of the marked doublets
    if use_pca:
        pca_rc = pca[0:num, :]
        knn_dist1_rc, knn_idx1_rc = knn_result1.kneighbors(X=pca_rc[doublet_flag == 1, :],
                                                           n_neighbors=10, return_distance=True)
    else:
        theta_rc = theta[0:num, :]
        knn_idx1_rc, knn_dist1_rc = knn_result1.query(theta_rc[doublet_flag == 1, :], k=10)

    dist_th = np.mean(knn_dist1_rc.flatten()) + 1.64 * np.std(knn_dist1_rc.flatten())
    doublet2_freq = np.logical_and(doublet_flag[knn_idx_rc] == 1, knn_dist_rc < dist_th)
    doublet2_nn = knn_dist_rc < dist_th
    doublet2_score = doublet2_freq.sum(axis=1) / doublet2_nn.sum(axis=1)

    doublet_flag[np.logical_and(doublet_flag == 0, doublet2_score >= doublet_th / 2)] = 2

    if graphs:
        if use_pca:
            ds.ca.PCA = pca[0:ds.shape[1], :]
        else:
            ds.ca.HPF = theta[0:ds.shape[1], :]
        doublets_plots.plot_all(ds, out_file=os.path.join(qc_dir, f"{name}_doublets.png"),
                                labels=doublet_flag, doublet_score_A=doublet_score_A,
                                logprob=logprob, xx=xx, score1=doublet_th1,
                                score2=doublet_th2, score=doublet_th)

    logging.info(
        f"Doublet fraction: {100 * len(np.where(doublet_flag > 0)[0]) / ds.shape[1]:.2f}%, "
        f"{len(np.where(doublet_flag > 0)[0])} cells.\n"
        f"\t\t\t(Expected detectable doublet fraction: {(5e-4 * ds.shape[1]):.2f}%)")
    return doublet_score, doublet_flag
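# Minimal usage sketch for doublet_finder. The loom file name below is a
# hypothetical placeholder; the file is expected to hold genes as rows and
# cells as columns.
def _demo_doublet_finder(loom_path: str = "sample.loom"):
    with loompy.connect(loom_path) as ds:
        score, flag = doublet_finder(ds, use_pca=True, graphs=False)
        ds.ca.DoubletScore = score
        ds.ca.DoubletFlag = flag  # 0 = singlet, 1 = doublet, 2 = neighbour of a doublet
        print(f"{(flag > 0).sum()} of {ds.shape[1]} cells flagged")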
class BalancedKNN:
    """Greedy algorithm to balance a K-nearest neighbour graph

    It has an API similar to scikit-learn

    Parameters
    ----------
    k : int (default=50)
        the number of neighbours in the final graph
    sight_k : int (default=100)
        the number of neighbours in the initialization graph
        It corresponds to the farthest neighbour that a sample is allowed to connect to
        when no closer neighbours are available. If sight_k is reached, then the matrix
        is filled with the sample itself
    maxl : int (default=200)
        max degree of connectivity allowed. Avoids the presence of hubs in the graph;
        it is the maximum number of neighbours that are allowed to contact a node
        before the node is blocked
    mode : str (default="distance")
        decides which kind of output, "distance" or "connectivity"
    metric : str (default="euclidean")
        the distance metric used for the initialization search
    minkowski_p : int (default=20)
        the p parameter, used only with the Minkowski metric
    n_jobs : int (default=4)
        parallelization of the standard KNN search performed at initialization
    """
    def __init__(self, k: int = 50, sight_k: int = 100, maxl: int = 200, mode: str = "distance",
                 metric: str = "euclidean", minkowski_p: int = 20, n_jobs: int = 4) -> None:
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.n_jobs = n_jobs
        self.dist_new = self.dsi_new = self.l = None  # type: np.ndarray
        self.bknn = None  # type: sparse.csr_matrix
        self.minkowski_p = minkowski_p

    @property
    def n_samples(self) -> int:
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model

        data: np.ndarray (samples, features)
        sight_k: int
            the farthest point that a node is allowed to connect to
            when its closest neighbours are not allowed
        """
        self.data = data
        self.fitdata = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(f"First search the {self.sight_k} nearest neighbours for {self.n_samples}")
        np.random.seed(13)
        if self.metric == "correlation":
            self.nn = NearestNeighbors(n_neighbors=self.sight_k + 1, metric=self.metric,
                                       p=self.minkowski_p, n_jobs=self.n_jobs, algorithm="brute")
            self.nn.fit(self.fitdata)
        elif self.metric == "js":
            # self.nn = cg.BallTreeJS(data=self.fitdata, leaf_size=10)
            self.nn = NNDescent(data=self.fitdata, metric=jensen_shannon_distance)
        else:
            self.nn = NearestNeighbors(n_neighbors=self.sight_k + 1, metric=self.metric,
                                       p=self.minkowski_p, n_jobs=self.n_jobs, leaf_size=30)
            self.nn.fit(self.fitdata)
        return self

    def kneighbors(self, X: np.ndarray = None, maxl: int = None,
                   mode: str = "distance") -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        maxl : int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        dist_new : np.ndarray (samples, k+1)
            distances to the NN
        dsi_new : np.ndarray (samples, k+1)
            indexes of the NN, first column is the sample itself
        l : np.ndarray (samples)
            l[i] is the number of connections from other samples to the sample i

        NOTE: the first column (0) corresponds to the sample itself; the nearest
        neighbour is in the second column (1)
        """
        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl
        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self.nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self.nn.kneighbors(self.data, return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self.nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self.nn.kneighbors(self.data, return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0
        logging.debug(f"Using the initialization network to find a {self.k}-NN graph "
                      f"with maximum connectivity of {self.maxl}")
        self.dist_new, self.dsi_new, self.l = knn_balance(self.dsi, self.dist, maxl=self.maxl, k=self.k)
        return self.dist_new, self.dsi_new, self.l

    def kneighbors_graph(self, X: np.ndarray = None, maxl: int = None, mode: str = "distance") -> sparse.csr_matrix:
        """Return the K-neighbors graph as a sparse csr matrix

        Parameters
        ----------
        X : array-like, shape (n_query, n_features)
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        maxl : int
            max degree of connectivity allowed
        mode : "distance" or "connectivity"
            Decides the kind of output

        Returns
        -------
        neighbor_graph : scipy.sparse.csr_matrix
            The values are either distances or connectivity depending on the mode parameter

        NOTE: The diagonal will be zero even though the value 0 is actually stored
        """
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new), np.ravel(dsi_new),
             np.arange(0, dist_new.shape[0] * dist_new.shape[1] + 1, dist_new.shape[1])),
            (self.n_samples, self.n_samples))
        return self.bknn

    def smooth_data(self, data_to_smooth: np.ndarray, X: np.ndarray = None, maxl: int = None,
                    mutual: bool = False, only_increase: bool = True) -> np.ndarray:
        """Use the weights learned from knn to smooth any data matrix

        Arguments
        ---------
        data_to_smooth: (features, samples)
            !! NOTE !! this is transposed with respect to the fit input (for speed reasons);
            if the data is provided as (samples, features), this will be detected and the
            correct operation performed, at the cost of some efficiency.
            In the case where samples == features, the shape (features, samples) is assumed.
        """
        if self.bknn is None:
            assert (X is None) and (maxl is None), "graph was already fit with different parameters"
            self.kneighbors_graph(X=X, maxl=maxl, mode=self.mode)
        if mutual:
            connectivity = make_mutual(self.bknn > 0)
        else:
            connectivity = self.bknn.T > 0
        connectivity = connectivity.tolil()
        connectivity.setdiag(1)
        w = connectivity_to_weights(connectivity).T
        assert np.allclose(w.sum(0), 1), "weight matrix needs to sum to one over the columns"
        if data_to_smooth.shape[1] == w.shape[0]:
            result = sparse.csr_matrix.dot(data_to_smooth, w)
        elif data_to_smooth.shape[0] == w.shape[0]:
            result = sparse.csr_matrix.dot(data_to_smooth.T, w).T
        else:
            raise ValueError(f"Incorrect matrix size: neither axis of data_to_smooth "
                             f"{data_to_smooth.shape} matches the graph {w.shape}")
        if only_increase:
            return np.maximum(result, data_to_smooth)
        else:
            return result
def find_edges(input, test, K): print(f"\tbuilding kNN classifier ... ", end=" ") st_time = time.time() if kNN_type in [1, 2]: input, test = input.todense(), test.todense() if kNN_type == 1: from sklearn.neighbors import NearestNeighbors tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input) elif kNN_type == 2: from scipy import spatial tree = spatial.KDTree(input) elif kNN_type == 3: from n2 import HnswIndex tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2'] for index in tqdm(range(input.shape[0])): tree.add_data(input[index, :]) tree.build(n_threads=20) elif kNN_type == 4: import pysparnn.cluster_index as ci input_num = input.shape[0] tree = ci.MultiClusterIndex(input, range(input_num)) elif kNN_type == 5: import nmslib M, efC = 30, 100 index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0} space_names = ['l2_sparse', 'cosinesimil_sparse'] # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md space_name = space_names[0] data_type = nmslib.DataType.SPARSE_VECTOR tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type) ''' def calc_zero_rows(i): if input[i, :].getnnz() == 0: return 1 else: return 0 pool = Pool(num_threads) zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0]))) print(f"# zero rows in input = {zero_row_num}", end=" ") ''' tree.addDataPointBatch(input) tree.createIndex(index_time_params, print_progress=True) # Setting query-time parameters efS = 100 query_time_params = {'efSearch': efS} print('Setting query-time parameters', query_time_params, end=" ") tree.setQueryTimeParams(query_time_params) else: raise NotImplementedError print(f"time={time.time()-st_time:.3f}s") print("\tfinding indices ... ", end=" ") if kNN_type == 1: _, indices = tree.kneighbors(test) elif kNN_type == 2: _, indices = tree.query(test, k=K + 1) elif kNN_type == 3: indices = [] for i in tqdm(range(test.shape[0])): indices.append(tree.search_by_vector(test[i, :], k=K + 1)) elif kNN_type == 4: indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False) elif kNN_type == 5: ''' def calc_zero_rows2(i): if test[i, :].getnnz() == 0: return 1 else: return 0 pool = Pool(num_threads) zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0]))) print(f"# zero rows in test = {zero_row_num}") ''' indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads) indices = [i[0] for i in indices_] del indices_ else: raise NotImplementedError print(f"time={time.time()-st_time:.3f}s") edge_list = [] for index1, per in enumerate(indices): assert len(per) == K+1, f"index1={index1} len(per)={len(per)} != K={K}" for index2 in per: index2 = int(index2) if index1 != index2: edge_list.append((index1, index2)) print(f"\tget edges done! .... time={time.time()-st_time:.3f}s") return edge_list
def find_edges(input, test, K, cluster_ids, query_ids):
    """Variant of find_edges that maps row positions back to global ids:
    rows of `input` are labelled by `cluster_ids`, rows of `test` by `query_ids`.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)
        # set query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time() - st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        # queries and the index cover different sets, so k=K (no self-match to discard)
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time() - st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            # compare global ids, not row positions, when dropping self-loops
            if query_ids[index1] != cluster_ids[index2]:
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time() - st_time:.3f}s")
    return edge_list
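# Minimal usage sketch for the id-mapped variant. The ids and the random data
# below are assumptions for illustration: cluster_ids / query_ids map row
# positions in `input` / `test` to hypothetical global node ids, and the sketch
# assumes this version of find_edges is the one in scope.
def _demo_find_edges_ids():
    global kNN_type
    kNN_type = 2
    from scipy import sparse
    centers = sparse.random(50, 32, density=0.5, format='csr', random_state=2)
    queries = sparse.random(80, 32, density=0.5, format='csr', random_state=3)
    cluster_ids = list(range(1000, 1050))  # hypothetical global ids of the indexed rows
    query_ids = list(range(2000, 2080))    # hypothetical global ids of the query rows
    edges = find_edges(centers, queries, 5, cluster_ids, query_ids)
    print(f"{len(edges)} edges, e.g. {edges[:3]}")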