def find_clusters(
    X: xpd.DataFrame,
    eps: float = 3000,
    min_samples: int = 250,
    output_colname: str = "cluster_id",
) -> xpd.Series:
    """
    Classify a point cloud into several groups, with each group being assigned
    a positive integer label like 1, 2, 3, etc. Unclassified noise points are
    labelled as NaN.

    Uses Density-based spatial clustering of applications with noise (DBSCAN).
    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering

    ***  **            111  NN
    ** ** *            11 22 N
    * ****     -->     1 2222
    ** **              33 22
    ******             333333

    Parameters
    ----------
    X : cudf.DataFrame or pandas.DataFrame
        A table of X, Y, Z points to run the clustering algorithm on.
    eps : float
        The maximum distance between 2 points such they reside in the same
        neighborhood. Default is 3000 (metres).
    min_samples : int
        The number of samples in a neighborhood such that this group can be
        considered as an important core point (including the point itself).
        Default is 250 (sample points).
    output_colname : str
        The name of the column for the output Series. Default is 'cluster_id'.

    Returns
    -------
    cluster_labels : cudf.Series or pd.Series
        Which cluster each datapoint belongs to. Noisy samples are labeled as
        NaN.
    """
    try:
        from cuml.cluster import DBSCAN
    except ImportError:
        from sklearn.cluster import DBSCAN

    # Run DBSCAN using {eps} m distance, and minimum of {min_samples} points
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(X=X)

    cluster_labels = dbscan.labels_ + 1  # noise points -1 becomes 0
    if isinstance(cluster_labels, np.ndarray):
        cluster_labels = xpd.Series(data=cluster_labels, dtype=xpd.Int32Dtype())
    cluster_labels = cluster_labels.mask(cond=cluster_labels == 0)  # turn 0 to NaN
    cluster_labels.index = X.index  # let labels have same index as input data
    cluster_labels.name = output_colname

    return cluster_labels
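# Hypothetical usage sketch for find_clusters (not part of the original
# source). It assumes the module-level `xpd` alias resolves to pandas and
# that `np` is numpy; with cuDF/cuML installed the same call works on a
# cudf.DataFrame instead.
import numpy as np
import pandas as pd


def _demo_find_clusters():
    rng = np.random.default_rng(seed=42)
    # Two dense blobs of points plus sparse noise, coordinates in metres
    blob_a = rng.normal(loc=(0, 0, 0), scale=500, size=(300, 3))
    blob_b = rng.normal(loc=(20000, 20000, 0), scale=500, size=(300, 3))
    noise = rng.uniform(low=-50000, high=50000, size=(50, 3))
    points = pd.DataFrame(
        data=np.vstack([blob_a, blob_b, noise]), columns=["x", "y", "z"]
    )

    labels = find_clusters(X=points, eps=3000, min_samples=250)
    print(labels.value_counts(dropna=False))  # cluster sizes; NaN rows are noise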
def dbscan(model,
           counts_per_word,
           embeddings=None,
           sim_thresh=0.8,
           min_samples=5,
           min_occs=1000,
           verbose=False):
    if embeddings is None:
        # Keep only hashtags with more than min_occs occurrences
        nb_to_keep = np.argmax(counts_per_word[:, 1].astype(int) < min_occs)
        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurrences')

        # Create fit data
        model_words = set(model.wv.index_to_key)
        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])
        X = np.array([model.wv[w] for w in words_kept])
    else:
        X = embeddings
        words_kept = np.arange(len(X)).astype(str)

    # Cosine DBSCAN: eps is expressed as (1 - cosine similarity threshold)
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric='cosine').fit(X)
    clust_labels = clustering.labels_

    if verbose:
        print(np.bincount(clust_labels + 1)[1:])
        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
def test_cuml_fit_clusters():
    # Create and populate a GPU DataFrame
    gdf_float = cudf.DataFrame()
    gdf_float['0'] = [1.0, 2.0, 5.0]
    gdf_float['1'] = [4.0, 2.0, 1.0]
    gdf_float['2'] = [4.0, 2.0, 1.0]

    # Setup and fit clusters
    dbscan_float = DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(gdf_float)

    print(dbscan_float.labels_)
def test_dbscan(self):
    import cudf
    from cuml.cluster import DBSCAN

    # Create and populate a GPU DataFrame
    gdf_float = cudf.DataFrame()
    gdf_float['0'] = [1.0, 2.0, 5.0]
    gdf_float['1'] = [4.0, 2.0, 1.0]
    gdf_float['2'] = [4.0, 2.0, 1.0]

    # Setup and fit clusters
    dbscan_float = DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(gdf_float)

    self.assertEqual(3, dbscan_float.labels_.size)
def dbscan_gpu(feature, max_neighbor_dist):
    """
    :param feature: array of shape num_samples x num_features
    :param max_neighbor_dist: the maximum distance between two samples for one
        to be considered as in the neighborhood of the other
    :return: cluster label of each sample, as a NumPy array
    """
    from cupy import asnumpy
    from cuml.cluster import DBSCAN

    db = DBSCAN(eps=max_neighbor_dist, min_samples=1).fit(feature)
    return asnumpy(db.labels_.values)
class T5_DBSCAN:
    def __init__(self, ndims=5, nn=25, eps=0.1, minSamples=25,
                 coreSamples=True, verbose=False):
        self.logger = logging.getLogger("T5Clustering")
        logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                            level=logging.DEBUG)
        self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
        gpu_mem(0)
        self.train_data = None
        self.test_data = None
        self.data = None
        self.clusterer = DBSCAN(eps=eps,
                                min_samples=minSamples,
                                verbose=verbose,
                                calc_core_sample_indices=coreSamples,
                                output_type='cudf')

    def split(self, fname, sampling=True, train_size=1000000, test_size=None):
        self.train_data, self.test_data = read(fname,
                                               sampling=sampling,
                                               train_size=train_size,
                                               test_size=test_size)
        return self

    def reduce(self, train_data):
        self.logger.info("Dimensionality reduction (UMAP): %s",
                         self.reducer.reducer.get_params())
        self.data = self.reducer.fit_transform(train_data)
        return self

    def cluster(self):
        self.logger.info("Clustering ... (DBSCAN)")
        gpu_mem(0)
        self.clusterer.fit(self.data)
        self.logger.info("Clusters: %d, Core samples: %d",
                         self.clusterer.labels_.max(),
                         len(self.clusterer.core_sample_indices_))
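# Hypothetical usage sketch for the T5_DBSCAN pipeline above (not part of the
# original source). The file name is a placeholder, and read(), DimReducer()
# and gpu_mem() are assumed to be the project's own helpers.
def _demo_t5_dbscan_pipeline(fname="embeddings.npz"):
    pipeline = T5_DBSCAN(ndims=5, nn=25, eps=0.1, minSamples=25)
    # split() and reduce() return self, so the steps can also be chained
    pipeline.split(fname, sampling=True, train_size=1_000_000)
    pipeline.reduce(pipeline.train_data)
    pipeline.cluster()
    return pipeline.clusterer.labels_  # cudf Series of cluster labels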
def _find_clusters_dbscan(points, eps, min_samples):
    # The ball_tree algorithm appears to run about 30-40% faster based on a
    # single test orbit and (vx, vy), run on a laptop, improving from 300 ms
    # to 180 ms.
    #
    # Runtime is not very sensitive to leaf_size, but 30 appears to be roughly
    # optimal, and is the default value anyway.
    db = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        algorithm="ball_tree",
        leaf_size=30,
    )
    db.fit(points)

    cluster_labels = np.unique(db.labels_[np.where(db.labels_ != -1)])
    clusters = []
    for label in cluster_labels:
        cluster_indices = np.where(db.labels_ == label)[0]
        clusters.append(cluster_indices)
    del db

    return clusters
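# Hypothetical usage sketch for _find_clusters_dbscan (not part of the
# original source), assuming the module-level numpy and sklearn DBSCAN
# imports that the function above relies on.
import numpy as np
from sklearn.cluster import DBSCAN


def _demo_find_clusters_dbscan():
    rng = np.random.default_rng(seed=0)
    # Two tight 2D blobs plus a few scattered noise points
    points = np.vstack([
        rng.normal(loc=(0.0, 0.0), scale=0.01, size=(20, 2)),
        rng.normal(loc=(1.0, 1.0), scale=0.01, size=(20, 2)),
        rng.uniform(low=-5.0, high=5.0, size=(5, 2)),
    ])
    clusters = _find_clusters_dbscan(points, eps=0.05, min_samples=5)
    # Each entry is an array of row indices belonging to one cluster
    for i, idx in enumerate(clusters):
        print(f"cluster {i}: {len(idx)} points")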
def clusterVelocity(
    obs_ids,
    x,
    y,
    dt,
    vx,
    vy,
    eps=0.005,
    min_samples=5,
    min_arc_length=1.0,
):
    """
    Clusters THOR projection with different velocities in the projection plane
    using `~sklearn.cluster.DBSCAN`.

    Parameters
    ----------
    obs_ids : `~numpy.ndarray` (N)
        Observation IDs.
    x : `~numpy.ndarray` (N)
        Projection space x coordinate in degrees or radians.
    y : `~numpy.ndarray` (N)
        Projection space y coordinate in degrees or radians.
    dt : `~numpy.ndarray` (N)
        Change in time from 0th exposure in units of MJD.
    vx : `~numpy.ndarray` (N)
        Projection space x velocity in units of degrees or radians per day in MJD.
    vy : `~numpy.ndarray` (N)
        Projection space y velocity in units of degrees or radians per day in MJD.
    eps : float, optional
        The maximum distance between two samples for them to be considered as
        in the same neighborhood.
        See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
        [Default = 0.005]
    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html
        [Default = 5]
    min_arc_length : float, optional
        Minimum arc length in units of days for a cluster to be accepted.

    Returns
    -------
    list
        If clusters are found, will return a list of numpy arrays containing
        the observation IDs for each cluster. If no clusters are found, will
        return np.NaN.
    """
    # Shift each observation back along the trial velocity (vx, vy)
    xx = x - vx * dt
    yy = y - vy * dt

    # cuML's DBSCAN does not accept n_jobs; only pass it to scikit-learn
    if USE_GPU:
        kwargs = {}
    else:
        kwargs = {"n_jobs": 1}

    X = np.vstack([xx, yy]).T

    db = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
    db.fit(X)

    clusters = db.labels_[np.where(db.labels_ != -1)[0]]
    cluster_ids = []

    if len(clusters) != 0:
        for cluster in np.unique(clusters):
            cluster_mask = np.where(db.labels_ == cluster)[0]
            dt_in_cluster = dt[cluster_mask]
            num_obs = len(dt_in_cluster)
            arc_length = dt_in_cluster.max() - dt_in_cluster.min()
            if (num_obs == len(np.unique(dt_in_cluster))) and (
                    num_obs >= min_samples) and (arc_length >= min_arc_length):
                cluster_ids.append(obs_ids[cluster_mask])

    logger.debug(
        f"cluster: vx={vx} vy={vy} n_obs={len(obs_ids)} n_cluster={len(cluster_ids)}",
    )

    if len(cluster_ids) == 0:
        cluster_ids = np.NaN

    del db

    return cluster_ids
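# Hypothetical usage sketch for clusterVelocity (not part of the original
# source). It assumes the module-level USE_GPU flag, logger, and DBSCAN
# import defined elsewhere in that codebase. The synthetic observations all
# belong to one object moving at a constant projection-plane velocity, so
# shifting by (vx, vy) collapses them into a single cluster.
import numpy as np


def _demo_cluster_velocity():
    n_obs = 10
    dt = np.linspace(0.0, 5.0, n_obs)      # days since the 0th exposure
    vx_true, vy_true = 0.1, -0.05          # degrees per day
    x = 1.0 + vx_true * dt                 # projection-plane positions
    y = 2.0 + vy_true * dt
    obs_ids = np.arange(n_obs)

    clusters = clusterVelocity(
        obs_ids, x, y, dt,
        vx=vx_true, vy=vy_true,
        eps=0.005, min_samples=5, min_arc_length=1.0,
    )
    print(clusters)  # list of obs_id arrays, or np.NaN if nothing was found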
train_data, test_data = read(
    "/mnt/m2-1/cw09b.dh.1k.spam.70.dochits.tier.5.t5-base.embeddings.0.npz",
    sampling=True,
    train_size=1000000,
    test_size=None)

# reduce dimensions
ndims = 5
nn = 25
logger.info("Dimensionality reduction: 768 to %d... (UMAP)", ndims)
reducer = DimReducer(n_components=ndims, n_neighbors=nn)
data = reducer.fit_transform(train_data)

# cluster
eps = 0.05
minSamples = 25
logger.info("Clustering ... (DBSCAN)")
dbscan = DBSCAN(
    eps=eps,
    min_samples=minSamples,
    verbose=False,
    # max_mbytes_per_batch=4096,
    calc_core_sample_indices=True,
    output_type='cudf')
dbscan.fit(data)
gpu_mem(0)

print(dbscan.labels_.max())
# print(len(dbscan.core_sample_indices_))
# print(dbscan.core_sample_indices_)
def dbscan_gpu(model,
               counts_per_word,
               embeddings=None,
               sim_thresh=0.8,
               min_samples=5,
               min_occs=1000,
               verbose=False,
               s2v=False):
    if embeddings is None:
        # Keep only hashtags with more than min_occs occurrences
        nb_to_keep = np.argmax(counts_per_word[:, 1].astype(int) < min_occs)
        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurrences')

        # Create fit data from either a gensim model or a sense2vec-style dict
        if not s2v:
            model_words = set(model.wv.index_to_key)
        else:
            model_words = set(model.keys())
        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])

        # Build a GPU DataFrame of embeddings, then a cosine distance matrix
        X = cudf.DataFrame()
        if s2v:
            transposed = np.array([model[w] for w in words_kept]).transpose()
        else:
            transposed = np.array([model.wv[w] for w in words_kept]).transpose()
        for e, v in enumerate(transposed):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')
    else:
        X = cudf.DataFrame()
        for e, v in enumerate(embeddings.transpose()):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')
        words_kept = np.arange(len(embeddings)).astype(str)

    # Cosine DBSCAN on the precomputed distance matrix:
    # eps is expressed as (1 - cosine similarity threshold)
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric="precomputed").fit(X)
    # labels_ is a cudf Series here; .to_array() is the older cuDF API
    # (newer releases use .to_numpy())
    clust_labels = clustering.labels_.to_array()

    if verbose:
        print(np.bincount(clust_labels + 1)[1:])
        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
def make_clusters_DBSCAN(dat_to_cluster, eps):
    estimator = DBSCAN(eps=eps, min_samples=3)
    res = estimator.fit_predict(dat_to_cluster)
    return res, estimator