def fit(self, X, y=None, sample_weight=None):
    # Keep only tags that are present in the word2vec vocabulary.
    X = [x for x in X if x[0] in self.w2v_model.wv]
    self.tags, self.reviews_num = zip(*X)
    self.X = np.array([self.w2v_model.wv[t] for t in self.tags])
    X = self.X
    # X = self._validate_data(X, accept_sparse='csr')

    if not self.eps > 0.0:
        raise ValueError("eps must be positive.")

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)

    # Calculate neighborhood for all samples. This leaves the original
    # point in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    if self.metric == 'precomputed' and sparse.issparse(X):
        # set the diagonal to explicit values, as a point is its own
        # neighbor
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
            X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

    neighbors_model = NearestNeighbors(
        radius=self.eps, algorithm=self.algorithm,
        leaf_size=self.leaf_size, metric=self.metric,
        metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)
    neighbors_model.fit(X)
    # This has worst case O(n^2) memory complexity
    neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = np.full(X.shape[0], -1, dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)

    self.core_sample_indices_ = np.where(core_samples)[0]
    self.labels_ = labels

    if len(self.core_sample_indices_):
        # fix for scipy sparse indexing issue
        self.components_ = X[self.core_sample_indices_].copy()
    else:
        # no core samples
        self.components_ = np.empty((0, X.shape[1]))
    return self
def fit(self, dataFrame: pd.DataFrame, spatialFeatures: list, temporalFeatures: list):
    '''Cluster the rows of ``dataFrame`` using the intersection of the
    spatial (``eps1``) and temporal (``eps2``) neighborhoods.'''
    if len(spatialFeatures) <= 0 or len(temporalFeatures) <= 0:
        raise ValueError(
            "spatialFeatures and temporalFeatures must be lists with length "
            "greater than 0")

    nnEps1 = NearestNeighbors(radius=self.eps1, algorithm="auto",
                              leaf_size=30, metric=self.metric1,
                              metric_params=self.metric1Params,
                              p=None, n_jobs=None)
    nnEps2 = NearestNeighbors(radius=self.eps2, algorithm="auto",
                              leaf_size=30, metric=self.metric2,
                              metric_params=self.metric2Params,
                              p=None, n_jobs=None)

    spatialDf = dataFrame[spatialFeatures]
    temporalDf = dataFrame[temporalFeatures]
    nnEps1.fit(spatialDf)
    nnEps2.fit(temporalDf)
    eps1Neighborhoods = nnEps1.radius_neighbors(spatialDf, return_distance=False)
    eps2Neighborhoods = nnEps2.radius_neighbors(temporalDf, return_distance=False)

    # Intersection of the two neighborhoods
    neighborhoods = []
    for eps1Neighbors, eps2Neighbors in zip(eps1Neighborhoods, eps2Neighborhoods):
        intersection = np.intersect1d(eps1Neighbors, eps2Neighbors)
        neighborhoods.append(intersection)
    neighborhoods = np.array(neighborhoods, dtype=object)

    # All samples are noise in the beginning
    labels = np.full(dataFrame.shape[0], -1, dtype=np.intp)

    # Number of neighbors per sample in the combined neighborhood.
    neighborsCounts = np.array(
        [len(neighborhood) for neighborhood in neighborhoods])
    corePoints = np.asarray(neighborsCounts >= self.minSamples, dtype=np.uint8)

    dbscan_inner(corePoints, neighborhoods, labels)
    self.labels = labels
    self.neighborhoods = neighborhoods
    self.corePoints = corePoints
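# --- Usage sketch for the spatio-temporal fit() above (illustrative only) ---
# The class name `STDBSCAN` and its constructor signature are assumptions;
# only the fit() method is shown above, so the constructor arguments are
# inferred from the attributes it reads (eps1, eps2, metric1, metric2,
# metric1Params, metric2Params, minSamples).
import numpy as np
import pandas as pd
from st_dbscan import STDBSCAN  # hypothetical module providing the estimator

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'x': rng.normal(size=100),            # spatial feature
    'y': rng.normal(size=100),            # spatial feature
    't': rng.uniform(0, 10, size=100),    # temporal feature
})

model = STDBSCAN(eps1=0.5, eps2=1.0, minSamples=5,
                 metric1='euclidean', metric1Params=None,
                 metric2='euclidean', metric2Params=None)
model.fit(df, spatialFeatures=['x', 'y'], temporalFeatures=['t'])
print(model.labels)      # cluster label per row, -1 for noise
print(model.corePoints)  # 1 for core points, 0 otherwise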
def dbscan(X, eps=0.5, minpts=5, metric='minkowski', algorithm='auto',
           leaf_size=30, p=2, sample_weight=None, n_jobs=1):
    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place
        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.cumsum(X_mask)[X.indptr[1:] - 1]
        # insert the diagonal: a point is its own neighbor, but 0 distance
        # means absence from sparse matrix data
        masked_indices = np.insert(masked_indices, masked_indptr,
                                   np.arange(X.shape[0]))
        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size, metric=metric,
                                           p=p, n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= minpts, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
def variable_eps_DBSCAN(X, eps_array, min_samples=5):
    """Density-Based Spatial Clustering of Applications with Noise.

    Parameters
    ----------
    X : array[float, float], shape=(n_samples, n_samples)
        Pairwise distance matrix.
    eps_array : array[float], shape=(n_samples,)
        The maximum distance between two points for them to be considered
        to be in the same neighborhood, applied locally.
    min_samples : int, optional (default = 5)
        The number of samples in a neighborhood for a point to be considered
        a core point. This includes the point itself.

    Returns
    -------
    core_samples : array, shape=[n_core_samples]
        Indices of core samples.
    labels : array, shape=[n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    Code adapted from the scikit-learn library.
    """
    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    neighborhoods = np.empty(X.shape[0], dtype=object)
    neighborhoods[:] = [np.where(x <= eps_array[i])[0]
                        for i, x in enumerate(X)]
    n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
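# --- Usage sketch for variable_eps_DBSCAN (illustrative only) ---
# Assumes the surrounding module imports numpy as np and the Cython helper
# dbscan_inner (sklearn.cluster._dbscan_inner) used by the function above.
# The per-point eps values and cluster layout here are made-up examples.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(42)
points = np.vstack([rng.normal(0, 0.2, (30, 2)),    # dense cluster
                    rng.normal(3, 0.8, (30, 2))])   # sparser cluster
X = squareform(pdist(points))                        # pairwise distance matrix

# Tighter eps for the dense half, looser eps for the sparse half.
eps_array = np.r_[np.full(30, 0.5), np.full(30, 1.5)]

core_indices, labels = variable_eps_DBSCAN(X, eps_array, min_samples=5)
print(np.unique(labels))   # e.g. [-1, 0, 1]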
def cluster(distances_filename: str, metadata_filename: str):
    """
    DBSCAN clustering of the embeddings based on a pairwise distance matrix.

    Parameters
    ----------
    distances_filename : str
        Precomputed pairwise distance matrix file to use for the DBSCAN
        clustering.
    metadata_filename : str
        Metadata file with precursor m/z information for all embeddings.
    """
    clusters_filename = (distances_filename
                         .replace('dist_', 'clusters_')
                         .replace('.npz', '.npy'))
    if os.path.isfile(clusters_filename):
        return
    # DBSCAN clustering of the embeddings.
    logger.info('DBSCAN clustering (eps=%.4f, min_samples=%d) of precomputed '
                'pairwise distance matrix %s', config.eps, config.min_samples,
                distances_filename)
    # Reimplement DBSCAN preprocessing to avoid unnecessary memory
    # consumption.
    dist = ss.load_npz(distances_filename)
    dist_data, dist_indices, dist_indptr = dist.data, dist.indices, dist.indptr
    num_embeddings = dist.shape[0]
    # Find the eps-neighborhoods for all points.
    logger.debug('Find the eps-neighborhoods for all points (eps=%.4f)',
                 config.eps)
    mask = dist_data <= config.eps
    # noinspection PyTypeChecker
    indptr = _cumsum(mask)[dist_indptr]
    indices = dist_indices[mask].astype(np.intp, copy=False)
    neighborhoods = np.split(indices, indptr[1:-1])
    # Initially, all samples are noise.
    # (Memmap for shared memory multiprocessing.)
    cluster_labels = np.lib.format.open_memmap(
        clusters_filename, mode='w+', dtype=np.intp, shape=(num_embeddings,))
    cluster_labels.fill(-1)
    # A list of all core samples found.
    n_neighbors = np.fromiter(map(len, neighborhoods), np.uint32)
    core_samples = n_neighbors >= config.min_samples
    # Run Scikit-Learn DBSCAN.
    logger.debug('Run Scikit-Learn DBSCAN inner.')
    neighborhoods_arr = np.empty(len(neighborhoods), dtype=object)
    neighborhoods_arr[:] = neighborhoods
    dbscan_inner(core_samples, neighborhoods_arr, cluster_labels)
    # Free up memory by deleting DBSCAN-related data structures.
    del dist, dist_data, dist_indices, dist_indptr, mask, indptr, indices
    del neighborhoods, n_neighbors, core_samples, neighborhoods_arr
    gc.collect()
    # Refine initial clusters to make sure spectra within a cluster don't have
    # an excessive precursor m/z difference.
    precursor_mzs = (pd.read_parquet(metadata_filename, columns=['mz'])
                     .squeeze().values.astype(np.float32))
    logger.debug('Sort cluster labels in ascending order.')
    order = np.argsort(cluster_labels)
    reverse_order = np.argsort(order)
    cluster_labels[:] = cluster_labels[order]
    precursor_mzs = precursor_mzs[order]
    logger.debug('Finetune %d initial cluster assignments to not exceed %d %s '
                 'precursor m/z tolerance', cluster_labels[-1] + 1,
                 config.precursor_tol_mass, config.precursor_tol_mode)
    if cluster_labels[-1] == -1:
        # Only noise samples.
        cluster_labels.fill(-1)
    else:
        group_idx = nb.typed.List(_get_cluster_group_idx(cluster_labels))
        n_clusters = nb.typed.List(joblib.Parallel(n_jobs=-1)(
            joblib.delayed(_postprocess_cluster)(
                cluster_labels[start_i:stop_i], precursor_mzs[start_i:stop_i],
                config.precursor_tol_mass, config.precursor_tol_mode,
                config.min_samples)
            for start_i, stop_i in group_idx))
        _assign_unique_cluster_labels(cluster_labels, group_idx, n_clusters,
                                      config.min_samples)
        cluster_labels[:] = cluster_labels[reverse_order]
    cluster_labels.flush()
    logger.debug('%d unique clusters after precursor m/z finetuning',
                 np.amax(cluster_labels) + 1)
def dbscan(X, Core=[], eps=0.5, min_samples=5, metric='minkowski',
           algorithm='kd_tree', leaf_size=30, p=2, sample_weight=None,
           n_jobs=1):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    Core : list, optional
        Indices of pre-specified core samples. If non-empty, the core samples
        are derived from these indices via ``expand_core`` instead of from the
        neighborhood counts.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_dbscan.py for an example.

    This implementation bulk-computes all neighborhood queries, which
    increases the memory complexity to O(n.d) where d is the average number
    of neighbors, while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                       leaf_size=leaf_size, metric=metric,
                                       p=p, n_jobs=n_jobs)
    neighbors_model.fit(X)
    # This has worst case O(n^2) memory complexity
    neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                     return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.zeros(X.shape[0], dtype=np.uint8)
    if len(Core) != 0:
        # Core samples are seeded from the given ``Core`` indices.
        NCore = expand_core(Core, neighborhoods, min_samples)
        for i in NCore:
            core_samples[i] = 1
    else:
        core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)

    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
def dbscan(coords, timestamps, eps_d=500, eps_t=10 / 1.66667e-11,
           min_samples=5, metric_d='l1', metric_t='l1', algorithm='auto',
           leaf_size=30, n_jobs=1):
    # Spatial neighborhoods around the trip origins (first two columns).
    neighbors_model_O = NearestNeighbors(radius=eps_d, leaf_size=leaf_size,
                                         metric=metric_d, n_jobs=n_jobs,
                                         algorithm=algorithm)
    neighbors_model_O.fit(coords[:, :2])
    neighborhoods_O = neighbors_model_O.radius_neighbors(
        coords[:, :2], eps_d, return_distance=False)

    # Spatial neighborhoods around the trip destinations (last two columns).
    neighbors_model_D = NearestNeighbors(radius=eps_d, leaf_size=leaf_size,
                                         metric=metric_d, n_jobs=n_jobs,
                                         algorithm=algorithm)
    neighbors_model_D.fit(coords[:, 2:])
    neighborhoods_D = neighbors_model_D.radius_neighbors(
        coords[:, 2:], eps_d, return_distance=False)

    # Temporal neighborhoods around the origin timestamps.
    neighbors_model_t_O = NearestNeighbors(radius=eps_t, leaf_size=leaf_size,
                                           metric=metric_t, n_jobs=n_jobs,
                                           algorithm=algorithm)
    neighbors_model_t_O.fit(timestamps[:, [0]])
    neighborhoods_t_O = neighbors_model_t_O.radius_neighbors(
        timestamps[:, [0]], eps_t, return_distance=False)

    # Temporal neighborhoods around the destination timestamps.
    neighbors_model_t_D = NearestNeighbors(radius=eps_t, leaf_size=leaf_size,
                                           metric=metric_t, n_jobs=n_jobs,
                                           algorithm=algorithm)
    neighbors_model_t_D.fit(timestamps[:, [1]])
    neighborhoods_t_D = neighbors_model_t_D.radius_neighbors(
        timestamps[:, [1]], eps_t, return_distance=False)

    # A point's neighborhood is the intersection of its four neighborhoods.
    n_neighbors = np.zeros(coords.shape[0], dtype=np.int16)
    neighborhoods = np.empty(coords.shape[0], dtype=object)
    for i in range(coords.shape[0]):
        neighbor_i = np.array(list(
            set(neighborhoods_O[i]).intersection(set(neighborhoods_D[i]),
                                                 set(neighborhoods_t_O[i]),
                                                 set(neighborhoods_t_D[i]))))
        neighborhoods[i] = neighbor_i
        n_neighbors[i] = neighbor_i.shape[0]

    # Initially, all samples are noise.
    labels = -np.ones(coords.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
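# --- Usage sketch for the origin-destination dbscan above (illustrative) ---
# The column layout (origin x/y in columns 0-1, destination x/y in columns
# 2-3, departure/arrival time in timestamps columns 0/1) is inferred from how
# the function slices its inputs; the prototype trips and eps values below are
# made-up examples.
import numpy as np

rng = np.random.default_rng(7)
# Three origin-destination prototypes, 60 trips each, with spatial jitter.
prototypes = np.array([[0, 0, 5000, 5000],
                       [8000, 1000, 2000, 9000],
                       [4000, 4000, 9000, 500]], dtype=float)
coords = np.repeat(prototypes, 60, axis=0) + rng.normal(0, 100, (180, 4))
# Departure (column 0) and arrival (column 1) times, in the units of eps_t.
departures = rng.uniform(0, 1e11, size=(180, 1))
timestamps = np.hstack([departures,
                        departures + rng.uniform(1e10, 2e10, (180, 1))])

core_indices, labels = dbscan(coords, timestamps,
                              eps_d=500, eps_t=6e11, min_samples=5)
print(f'{labels.max() + 1} clusters, {np.sum(labels == -1)} noise trips')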
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which
    increases the memory complexity to O(n.d) where d is the average number
    of neighbors, while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place
        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]]
        # insert the diagonal: a point is its own neighbor, but 0 distance
        # means absence from sparse matrix data
        masked_indices = np.insert(masked_indices, masked_indptr,
                                   np.arange(X.shape[0]))
        masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0])
        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size, metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
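# --- Minimal call of the dbscan() function above (illustrative only) ---
# Assumes the module-level imports the function relies on (numpy, scipy.sparse,
# check_array, check_consistent_length, NearestNeighbors, dbscan_inner) are
# present; the data set and parameter values are made-up examples.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
core_sample_indices, labels = dbscan(X, eps=0.3, min_samples=10)

print('clusters:', len(set(labels)) - (1 if -1 in labels else 0))
print('noise points:', np.sum(labels == -1))
print('core samples:', len(core_sample_indices))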
def dbscan(self, X, eps=0.5, min_samples=5, metric='minkowski',
           metric_params=None, algorithm='auto', leaf_size=30, p=2,
           sample_weight=None, n_jobs=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point.
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place

        # set the diagonal to explicit values, as a point is its own neighbor
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
            X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
        masked_indptr = masked_indptr[X.indptr[1:-1]]

        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size, metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(
            X, eps, return_distance=False)

    if sample_weight is None:
        n_neighbors = np.array(
            [len(neighbors) for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([
            np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = np.full(X.shape[0], -1, dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
def generate_clusters(pairwise_dist_matrix: ss.csr_matrix, eps: float,
                      min_samples: int, precursor_mzs: np.ndarray,
                      precursor_tol_mass: float, precursor_tol_mode: str) \
        -> np.ndarray:
    """
    DBSCAN clustering of the given pairwise distance matrix.

    Parameters
    ----------
    pairwise_dist_matrix : ss.csr_matrix
        A sparse pairwise distance matrix used for clustering.
    eps : float
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other.
    min_samples : int
        The number of samples in a neighborhood for a point to be considered
        as a core point. This includes the point itself.
    precursor_mzs : np.ndarray
        Precursor m/z's matching the pairwise distance matrix.
    precursor_tol_mass : float
        Maximum precursor mass tolerance for points to be clustered together.
    precursor_tol_mode : str
        The unit of the precursor m/z tolerance ('Da' or 'ppm').

    Returns
    -------
    np.ndarray
        Cluster labels. Noisy samples are given the label -1.
    """
    # DBSCAN clustering using the precomputed pairwise distance matrix.
    logger.info('DBSCAN clustering (eps=%.4f, min_samples=%d) of precomputed '
                'pairwise distance matrix', eps, min_samples)
    # Reimplement DBSCAN preprocessing to avoid unnecessary memory
    # consumption.
    # Find the eps-neighborhoods for all points.
    mask = pairwise_dist_matrix.data <= eps
    indices = pairwise_dist_matrix.indices[mask].astype(np.intp)
    indptr = np.zeros(len(mask) + 1, dtype=np.int64)
    np.cumsum(mask, out=indptr[1:])
    indptr = indptr[pairwise_dist_matrix.indptr]
    neighborhoods = np.split(indices, indptr[1:-1])
    # Initially, all samples are noise.
    clusters = np.full(pairwise_dist_matrix.shape[0], -1, dtype=np.intp)
    # A list of all core samples found.
    n_neighbors = np.fromiter(map(len, neighborhoods), np.uint32)
    core_samples = n_neighbors >= min_samples
    # Run Scikit-Learn DBSCAN.
    neighborhoods_arr = np.empty(len(neighborhoods), dtype=object)
    neighborhoods_arr[:] = neighborhoods
    dbscan_inner(core_samples, neighborhoods_arr, clusters)

    # Refine initial clusters to make sure spectra within a cluster don't have
    # an excessive precursor m/z difference.
    order = np.argsort(clusters)
    reverse_order = np.argsort(order)
    clusters, precursor_mzs = clusters[order], precursor_mzs[order]
    logger.debug('Finetune %d initial unique clusters to not exceed %.2f %s '
                 'precursor m/z tolerance', clusters[-1] + 1,
                 precursor_tol_mass, precursor_tol_mode)
    group_idx = _get_cluster_group_idx(clusters)
    if len(group_idx) == 0:
        # Only noise samples.
        return -np.ones_like(precursor_mzs, dtype=np.int64)
    cluster_reassignments = nb.typed.List(joblib.Parallel(n_jobs=-1)(
        joblib.delayed(_postprocess_cluster)(
            precursor_mzs[start_i:stop_i], precursor_tol_mass,
            precursor_tol_mode)
        for start_i, stop_i in group_idx))
    clusters = _assign_unique_cluster_labels(
        group_idx, cluster_reassignments, min_samples)[reverse_order]
    logger.debug('%d unique clusters after precursor m/z finetuning',
                 np.amax(clusters) + 1)
    return clusters
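# --- Usage sketch for generate_clusters (illustrative only) ---
# Assumes the surrounding module (with _get_cluster_group_idx,
# _postprocess_cluster, _assign_unique_cluster_labels, dbscan_inner, logger,
# nb, joblib) is importable. The toy distance matrix and precursor m/z values
# below are made-up; only explicitly stored distances can form neighborhoods,
# since missing entries in the sparse matrix are treated as "not neighbors".
import numpy as np
import scipy.sparse as ss

rows = [0, 1, 1, 2, 0, 2]
cols = [1, 0, 2, 1, 2, 0]
dists = [0.05, 0.05, 0.08, 0.08, 0.09, 0.09]
pairwise_dist_matrix = ss.csr_matrix((dists, (rows, cols)), shape=(4, 4))

precursor_mzs = np.array([500.001, 500.002, 500.0015, 800.0], dtype=np.float32)
clusters = generate_clusters(pairwise_dist_matrix, eps=0.1, min_samples=2,
                             precursor_mzs=precursor_mzs,
                             precursor_tol_mass=20, precursor_tol_mode='ppm')
print(clusters)   # cluster label per spectrum, -1 for noise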
def faiss_dbscan(X, eps=0.5, min_samples=5, nlist=100, nprobe=5, metric='l2',
                 metric_params=None, algorithm='auto', leaf_size=30, p=2,
                 sample_weight=None, n_jobs=1, GPU=False, IVFFlat=True):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    nlist : int, optional (default = 100)
        Passed through to the radius-search helpers (Faiss IVF index
        parameter: number of inverted lists).

    nprobe : int, optional (default = 5)
        Passed through to the radius-search helpers (Faiss IVF index
        parameter: number of lists probed per query).

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    GPU : bool, optional (default = False)
        If True, compute the eps-neighborhoods with ``gpu_radius_neighbors``;
        otherwise use ``cpu_radius_neighbors``.

    IVFFlat : bool, optional (default = True)
        Passed through to the radius-search helpers to select the Faiss
        index type.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_dbscan.py for an example.

    This implementation bulk-computes all neighborhood queries, which
    increases the memory complexity to O(n.d) where d is the average number
    of neighbors, while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    if GPU is True:
        neighborhoods = gpu_radius_neighbors(X, eps, min_samples, nlist,
                                             nprobe, return_distance=False,
                                             IVFFlat=IVFFlat)
    else:
        neighborhoods = cpu_radius_neighbors(X, eps, min_samples, nlist,
                                             nprobe, return_distance=False,
                                             IVFFlat=IVFFlat)

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
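# --- Usage sketch for faiss_dbscan (illustrative only) ---
# Assumes the gpu_radius_neighbors/cpu_radius_neighbors helpers the function
# delegates to are defined elsewhere in the module (they appear to wrap a
# Faiss radius search). The eps value below is a placeholder: it must be
# expressed in whatever distance convention those helpers use (e.g. squared
# L2 for a raw Faiss index).
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (500, 64)),
               rng.normal(5, 0.3, (500, 64))]).astype(np.float32)  # Faiss expects float32

core_indices, labels = faiss_dbscan(X, eps=10.0, min_samples=10,
                                    nlist=100, nprobe=5,
                                    GPU=False, IVFFlat=True)
print('clusters:', labels.max() + 1, 'noise:', np.sum(labels == -1))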
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None,
           n_jobs=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    See also
    --------
    DBSCAN
        An estimator interface for this clustering algorithm.
    OPTICS
        A similar estimator interface clustering at multiple values of eps.
        Our implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which
    increases the memory complexity to O(n.d) where d is the average number
    of neighbors, while original DBSCAN had memory complexity O(n). It may
    attract a higher memory complexity when querying these nearest
    neighborhoods, depending on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :func:`cluster.optics <sklearn.cluster.optics>` provides a similar
    clustering with lower memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
    ACM Transactions on Database Systems (TODS), 42(3), 19.
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i. While True, its useless information)
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = np.empty(X.shape[0], dtype=object)
        X.sum_duplicates()  # XXX: modifies X's internals in-place

        # set the diagonal to explicit values, as a point is its own neighbor
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
            X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

        X_mask = X.data <= eps
        masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
        masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))
        masked_indptr = masked_indptr[X.indptr[1:-1]]

        # split into rows
        neighborhoods[:] = np.split(masked_indices, masked_indptr)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size, metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)
        # np.savetxt('sklearn_neighborhoods', neighborhoods, fmt='%s')

    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = np.full(X.shape[0], -1, dtype=np.intp)

    # A list of all core samples found.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
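# --- Precomputed sparse neighborhood sketch (illustrative only) ---
# As the notes in the docstring above suggest, the O(n.d) neighborhood query
# can be avoided by precomputing a sparse eps-distance graph and passing it
# back in with metric='precomputed'. The data set and parameter values below
# are made-up examples.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=0)
eps = 0.3

# Sparse graph holding only pairwise distances within eps.
D = (NearestNeighbors(radius=eps).fit(X)
     .radius_neighbors_graph(X, mode='distance'))

core_sample_indices, labels = dbscan(D, eps=eps, min_samples=10,
                                     metric='precomputed')
print('clusters:', labels.max() + 1)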