def fitness(self):
    clusterer = HDBSCAN(
        algorithm=self.parametros["algorithm"],
        min_cluster_size=self.parametros["min_cluster_size"],
        min_samples=self.parametros["min_samples"],
        cluster_selection_method=self.parametros["cluster_selection_method"],
        cluster_selection_epsilon=self.parametros["cluster_selection_epsilon"])
    clusterer.fit(self.data)
    self.labels = clusterer.labels_
    silhouette_score = self.silhouette_score(self.data, self.labels)
    # balance = self.balance(clusterer.labels_)
    # percents = self.calc_percents(clusterer.labels_)
    # len_labels = self.len_labels(clusterer.labels_)
    # noise_percents = [item for item in percents if item[0] == -1][0][1]
    score = silhouette_score
    # print(percents)
    # print('\'' + str(json.dumps(self.parametros)) + '\'')
    # print(score)
    # print("---------------------\n\n")
    return score
def clustering(umap_embedding_fit, umap_embedding_predict, min_cluster_size,
               prediction_data):
    print("clustering...")
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data).fit(umap_embedding_fit)
    clustering = hdbscan.fit_predict(umap_embedding_predict)
    labels = hdbscan.labels_
    return clustering, labels
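# A minimal usage sketch for the function above, not part of the original.
# It assumes the hypothetical arrays `train_data` and `new_data`, and uses
# umap-learn to produce the two embeddings the function expects.
import umap

reducer = umap.UMAP(n_components=2, random_state=42).fit(train_data)
embedding_fit = reducer.transform(train_data)
embedding_predict = reducer.transform(new_data)
cluster_ids, labels = clustering(embedding_fit, embedding_predict,
                                 min_cluster_size=15, prediction_data=True)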
def test_hdbscan_allow_single_cluster_with_epsilon():
    np.random.seed(0)
    no_structure = np.random.rand(150, 2)

    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.0,
        cluster_selection_method="eom",
        allow_single_cluster=True,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 46

    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(
        min_cluster_size=5,
        cluster_selection_epsilon=0.2,
        cluster_selection_method="eom",
        allow_single_cluster=True,
    ).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
def hdbscan_samples(data, min_samples, n, filename):
    hdbscan = HDBSCAN(min_samples=min_samples, metric='haversine')
    data = data[np.random.randint(low=0, high=len(data), size=n), :]

    t0 = time.time()
    hdbscan.fit(np.radians(data))
    t1 = time.time() - t0
    clusters = len(np.unique(hdbscan.labels_))

    project = os.path.realpath('.')
    csv = os.path.join(project, filename)
    if not os.path.exists(csv):
        with open(csv, mode='w') as timing:
            timing.write('min_samples,n,clusters,seconds\n')
    with open(csv, mode='a') as timing:
        timing.write('{},{},{},{}\n'.format(min_samples, n, clusters, t1))

    print('HDBSCAN: {} samples, {} clusters, {} seconds'.format(
        n, clusters, t1))
    return t1
def test_switch_to_leaf():
    """
    Verify that when we request more clusters than 'eom' can handle,
    the method switches to 'leaf' and the results match 'leaf'.
    """
    # Given the max number of clusters that can be produced by 'eom'
    # (these are produced for epsilon=0),
    clusterer = HDBSCAN(cluster_selection_method='eom',
                        cluster_selection_epsilon=0).fit(X)
    max_clusters = n_clusters_from_labels(clusterer.labels_)

    with warnings.catch_warnings(record=True) as w:
        # When we try flat clustering with 'eom' method for more n_clusters,
        clusterer_flat = HDBSCAN_flat(X, cluster_selection_method='eom',
                                      n_clusters=max_clusters + 2)
        # Then, a warning is raised saying 'eom' can't get this clustering,
        assert len(w) > 0
        assert issubclass(w[-1].category, UserWarning)
        assert "Cannot predict" in str(w[-1].message)

    # the resulting clusterer switches to using method 'leaf',
    assert clusterer_flat.cluster_selection_method == 'leaf', (
        "cluster selection method has not switched to 'leaf'")

    # and the resulting probabilities and labels must match
    epsilon = clusterer_flat.cluster_selection_epsilon
    clusterer_leaf = HDBSCAN(cluster_selection_method='leaf',
                             cluster_selection_epsilon=epsilon).fit(X)
    assert_array_equal(clusterer_flat.labels_, clusterer_leaf.labels_)
    assert_array_equal(clusterer_flat.probabilities_,
                       clusterer_leaf.probabilities_)
    return
def apply(self, fX):
    from hdbscan import HDBSCAN

    clusterer = HDBSCAN(min_cluster_size=self.min_cluster_size,
                        min_samples=self.min_samples,
                        metric='precomputed')
    distance_matrix = squareform(pdist(fX, metric=self.metric))

    # apply clustering
    cluster_labels = clusterer.fit_predict(distance_matrix)

    # cluster embedding
    n_clusters = np.max(cluster_labels) + 1
    if n_clusters < 2:
        # np.int was removed from recent NumPy releases; use the builtin int
        return np.zeros(fX.shape[0], dtype=int)

    fC = l2_normalize(
        np.vstack([np.sum(fX[cluster_labels == k, :], axis=0)
                   for k in range(n_clusters)]))

    # tag each undefined embedding to closest cluster
    undefined = cluster_labels == -1
    closest_cluster = np.argmin(
        cdist(fC, fX[undefined, :], metric=self.metric), axis=0)
    cluster_labels[undefined] = closest_cluster

    return cluster_labels
def test_flat_base_default():
    """
    Verify that the default clustering of HDBSCAN is preserved.
    """
    # Given, the base HDBSCAN with method 'eom'
    clusterer = HDBSCAN(cluster_selection_method='eom').fit(X)
    n_clusters = n_clusters_from_labels(clusterer.labels_)

    # When we ask for flat clustering with same n_clusters,
    clusterer_flat = HDBSCAN_flat(X, n_clusters=n_clusters,
                                  cluster_selection_method='eom')

    # Then, the labels and probabilities should match
    assert_array_equal(clusterer_flat.labels_, clusterer.labels_)
    assert_array_equal(clusterer_flat.probabilities_,
                       clusterer.probabilities_)

    # Given, the base HDBSCAN with method 'leaf'
    clusterer = HDBSCAN(cluster_selection_method='leaf').fit(X)
    n_clusters = n_clusters_from_labels(clusterer.labels_)

    # When we ask for flat clustering with same n_clusters,
    clusterer_flat = HDBSCAN_flat(X, n_clusters=n_clusters,
                                  cluster_selection_method='leaf')

    # Then, the labels and probabilities should match
    assert_array_equal(clusterer_flat.labels_, clusterer.labels_)
    assert_array_equal(clusterer_flat.probabilities_,
                       clusterer.probabilities_)
    return
def hdbscan_clustering(self,
                       min_cluster_size=10,
                       min_cluster_portion=None,
                       min_samples=1,
                       metric='hamming',
                       cluster_selection_method='eom',
                       allow_single_cluster=True,
                       epsilon=0.2):
    if min_cluster_portion is not None:
        if min_cluster_size is None:
            min_cluster_size = 0
        min_cluster_size = max(min_cluster_size,
                               self.n_obs * min_cluster_portion)
    else:
        if min_cluster_size is None:
            raise ValueError(
                'Either min_cluster_size or min_cluster_portion should be provided')

    runner = HDBSCAN(min_cluster_size=int(min_cluster_size),
                     min_samples=int(min_samples),
                     metric=metric,
                     cluster_selection_method=cluster_selection_method,
                     allow_single_cluster=allow_single_cluster)

    if self.leiden_result_df is None:
        raise ValueError(
            'Run multi_leiden_clustering first before hdbscan_clustering')
    runner.fit(self.leiden_result_df)
    self.hdbscan = runner

    self.reselect_clusters(epsilon=epsilon,
                           min_cluster_size=min_cluster_size)
    return
def _cluster_train(df):
    start_time = time.time()

    # Note: issue #88 is still open; prediction_data cannot be used with the
    # haversine metric, see
    # https://github.com/scikit-learn-contrib/hdbscan/issues/88
    db = HDBSCAN(min_samples=1,
                 metric='haversine',
                 core_dist_n_jobs=-1,
                 memory='./__pycache__/',
                 prediction_data=True)
    coords = df[['latitude', 'longitude']] * np.pi / 180
    df = df.assign(cluster=db.fit_predict(coords))

    # get the number of clusters
    num_clusters = db.labels_.max()
    message = ('Clustered {:,} points down to {:,} clusters, '
               'for {:.1f}% compression in {:,.2f} seconds')
    print(message.format(len(df), num_clusters,
                         100 * (1 - float(num_clusters) / len(df)),
                         time.time() - start_time))

    # Get the point closest to the center of each cluster
    cluster_centers = (df[['cluster', 'latitude', 'longitude']]
                       .groupby('cluster')[['latitude', 'longitude']]
                       .agg(lambda x: _get_centermost_point(x.values)))
    df = df.merge(cluster_centers,
                  left_on='cluster',
                  right_index=True,
                  how='left',
                  suffixes=('', '_cluster'))
    return db, cluster_centers, df
def run(self):
    if self.isStopped():
        self.canceled.emit()
        return False

    options = self.options
    clusterer = HDBSCAN(
        min_cluster_size=options.min_cluster_size,
        min_samples=options.min_samples,
        cluster_selection_epsilon=options.cluster_selection_epsilon,
        cluster_selection_method=options.cluster_selection_method)

    layout_data = self._widget.get_layout_data()
    isolated_nodes = layout_data['isolated_nodes']
    layout = layout_data['layout']

    mask = np.ones_like(layout, dtype=bool)
    mask[isolated_nodes] = False
    x = layout[mask].reshape(-1, 2)
    clusterer.fit(x.astype(np.float64))

    i = 0
    result = []
    for n in self._widget.scene().nodes():
        if n.index() in isolated_nodes:
            result.append("Noise")
        else:
            # HDBSCAN labels noise points as -1; valid clusters start at 0
            label = clusterer.labels_[i]
            result.append(f"Cluster {label + 1}" if label >= 0 else "Noise")
            i += 1

    if not self.isStopped():
        return result
    else:
        self.canceled.emit()
def get_clusters(self, coordinates, original_df, coordinates_df, csv_path):
    """
    It employs the HDBSCAN method to gather the supplied coordinates
    into clusters.

    Parameters
    ----------
    coordinates : numpy.array
        The array of coordinates that will be clustered. Its shape must
        fulfill the following dimensions: [M, N, 3], where M is the
        total number of models that have been sampled with PELE and N
        is the total number of atoms belonging to the residue that is
        being analyzed
    original_df : pandas.DataFrame
        Original dataframe from Analysis to be overwritten
    coordinates_df : pandas.DataFrame
        The filtered dataframe which was used to extract coordinates
        for clustering
    csv_path : str
        Directory where the CSV will be saved

    Returns
    -------
    clusters : numpy.array
        The array of cluster labels assigned to each conformer from the
        supplied array
    """
    from hdbscan import HDBSCAN

    coordinates = Clustering.fix_coordinates_shape(coordinates)
    clustering_method = HDBSCAN(cluster_selection_epsilon=self._bandwidth)
    clusters = clustering_method.fit_predict(coordinates)

    self._save_cluster_info(original_df, coordinates_df, clusters, csv_path)

    return clusters
def cluster_data_points(data_points, cluster_size=5,
                        distance_metric_func="Fractional"):
    points = [d['encoding'] for d in data_points]
    points = np.vstack(points)

    scaler = StandardScaler()
    scaler.fit(points)
    points = scaler.transform(points)

    dist_metric = Similarity()
    if distance_metric_func == "Fractional":
        dist_metric_func = dist_metric.fractional_distance
    else:
        dist_metric_func = dist_metric.euclidean_distance

    clusterer = HDBSCAN(min_cluster_size=cluster_size,
                        metric='pyfunc',
                        func=dist_metric_func)
    clusterer.fit(points)
    logging.info("Fit complete.")

    results = {}
    labelIDs = np.unique(clusterer.labels_)
    for labelID in labelIDs:
        paths = []
        encodings = []
        idxs = np.where(clusterer.labels_ == labelID)[0]
        for i in idxs:
            data = data_points[i]
            paths.append(data['path'])
            encodings.append(data['encoding'])

        results[labelID] = {
            'paths': paths,
            'mean_encoding': np.mean(np.asarray(encodings), axis=0),
            'std_dev': np.std(encodings, axis=0),
            'sample_size': len(paths)
        }
    return results
def hdbscan(X, min_cluster_size=5, gen_min_span_tree=True, **kwargs):
    """Clustering with Hierarchical DBSCAN.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    min_cluster_size : int, default: 5
        the minimum number of points necessary to generate a cluster
    gen_min_span_tree : bool
        whether to generate the minimum spanning tree (the default is True).
    kwargs
        additional keyword arguments passed through to hdbscan.HDBSCAN

    Returns
    -------
    fitted cluster instance: hdbscan.hdbscan.HDBSCAN
    """
    try:
        from hdbscan import HDBSCAN
    except ImportError:
        raise ImportError(
            "You must have the hdbscan package installed to use this function")
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    gen_min_span_tree=gen_min_span_tree,
                    **kwargs)
    model.fit(X)
    return model
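# A minimal usage sketch for the wrapper above, not part of the original.
# It assumes a small synthetic dataset from scikit-learn's make_blobs.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=42)
model = hdbscan(X_demo, min_cluster_size=10)
n_found = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
print("clusters found:", n_found)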
def HDBSCAN(self, parameters):
    # expected keys: data, min_cluster_size, min_samples, alpha,
    # cluster_selection_method
    result = {}
    default_min_cluster_size = 3
    default_min_samples = 3
    default_alpha = 0.5  # a float greater than 1
    default_cluster_selection_method = "eom"  # "eom" or "leaf"

    data = np.array(parameters['data'])
    data = preprocessing.MinMaxScaler().fit_transform(data)

    if parameters.get('min_cluster_size') is not None:
        default_min_cluster_size = int(parameters['min_cluster_size'])
    if parameters.get('min_samples') is not None:
        default_min_samples = int(parameters['min_samples'])
    if parameters.get('alpha') is not None:
        default_alpha = float(parameters['alpha'])
    if parameters.get('cluster_selection_method') is not None:
        default_cluster_selection_method = str(
            parameters['cluster_selection_method'])

    model = HDBSCAN(
        min_cluster_size=default_min_cluster_size,
        min_samples=default_min_samples,
        alpha=default_alpha,
        cluster_selection_method=default_cluster_selection_method,
        allow_single_cluster=True)
    clustering = model.fit(data)
    result['clustering'] = clustering
    return result
def clusterFromDistMatrix(distanceMatrix):
    clusterer = HDBSCAN(min_cluster_size=2, metric='precomputed')
    clusterer.fit(distanceMatrix)
    labels = clusterer.labels_
    probs = clusterer.probabilities_

    labels = np.array(labels)[np.newaxis]
    labels = labels.T
    probs = np.array(probs)[np.newaxis]
    probs = probs.T

    results = np.concatenate((probs, gv.medSequenceMatrix), axis=1)
    results = np.concatenate((labels, results), axis=1)
    results = np.array(sorted(results, key=lambda a_entry: a_entry[0]))

    with open('treatmentClusters.txt', 'w') as csvfile:
        csvfile.write(
            "Cluster; Probability; ID; Month 1; Month 2; Month 3; "
            "Month 4; Month 5; Month 6")
        csvfile.write('\n')
        for i in range(results.shape[0]):
            csvfile.write(str(results[i, 0]))
            csvfile.write(';')
            csvfile.write(str(results[i, 1]))
            csvfile.write(';')
            csvfile.write(str(results[i, 2]))
            csvfile.write(';')
            for j in range(3, results.shape[1]):
                csvfile.write(
                    str(results[i, j]).replace('{', '').replace('}', ''))
                csvfile.write(';')
            csvfile.write('\n')
def TrainCluster(x):
    # creating a clustering object
    clusterer = HDBSCAN(min_cluster_size=1250,
                        gen_min_span_tree=True,
                        prediction_data=True)
    # fitting the cluster object on the training data
    hdb = clusterer.fit(x)
    hdb.prediction_data
    del x
    return hdb
def cluster_hdbscan(above_gps):
    sample_by_feature = above_gps.to_numpy()
    print(sample_by_feature.shape)
    clusterer = HDBSCAN()
    clusterer.fit(sample_by_feature)
    cluster_ids = list(clusterer.labels_)
    return cluster_ids
def hdbscan_cluster(df: pd.DataFrame,
                    min_cluster_size: int = 10,
                    gen_min_span_tree: bool = True):
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        gen_min_span_tree=gen_min_span_tree)
    clusterer.fit(df)
    return clusterer.labels_, clusterer.probabilities_
def test_hdbscan_min_span_tree_availability():
    # without gen_min_span_tree the minimum spanning tree is not computed
    clusterer = HDBSCAN().fit(X)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None

    # the same holds when fitting on a precomputed distance matrix
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    clusterer = HDBSCAN(metric='precomputed').fit(D)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None
def test_hdbscan_caching():
    cachedir = mkdtemp()
    labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_
    labels2 = HDBSCAN(memory=cachedir, min_samples=5,
                      min_cluster_size=6).fit(X).labels_
    n_clusters1 = len(set(labels1)) - int(-1 in labels1)
    n_clusters2 = len(set(labels2)) - int(-1 in labels2)
    assert n_clusters1 == n_clusters2
def hdbscan_clustering(S, X, config):
    '''
    Computes H-DBSCAN clustering from an input similarity matrix.
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
def test_missing_data():
    """Tests if nan data are treated as infinite distance from all other
    points and assigned to -1 cluster"""
    model = HDBSCAN().fit(X_missing_data)
    assert model.labels_[0] == -1
    assert model.labels_[5] == -1
    assert model.probabilities_[0] == 0
    assert model.probabilities_[5] == 0

    clean_indices = list(range(1, 5)) + list(range(6, 200))
    clean_model = HDBSCAN().fit(X_missing_data[clean_indices])
    assert np.allclose(clean_model.labels_, model.labels_[clean_indices])
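# A hedged sketch of how the X_missing_data fixture used above might be built;
# the real fixture lives in the test module, so this is an assumption. Rows 0
# and 5 of a 200-point blob dataset are overwritten with NaN so HDBSCAN treats
# them as missing and assigns them to the -1 (noise) cluster.
import numpy as np
from sklearn.datasets import make_blobs

X_missing_data, _ = make_blobs(n_samples=200, random_state=10)
X_missing_data[0] = [np.nan, 1]
X_missing_data[5] = [np.nan, np.nan]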
def fit(self, data, min_cluster_size, min_samples, alpha,
        cluster_selection_method):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    alpha=alpha,
                    cluster_selection_method=cluster_selection_method,
                    allow_single_cluster=True)
    clustering = model.fit(data)
    return clustering
def test_hdbscan_centroids_medoids():
    centers = [(0.0, 0.0), (3.0, 3.0)]
    H, y = make_blobs(n_samples=1000, random_state=0, centers=centers,
                      cluster_std=0.5)
    clusterer = HDBSCAN().fit(H)

    for idx, center in enumerate(centers):
        centroid = clusterer.weighted_cluster_centroid(idx)
        assert_array_almost_equal(centroid, center, decimal=1)

        medoid = clusterer.weighted_cluster_medoid(idx)
        assert_array_almost_equal(medoid, center, decimal=1)
def hdbscan(self, args):
    start = time.time()
    model = HDBSCAN(
        min_cluster_size=args["min_cluster_size"],
        metric=args["metric"],
        leaf_size=args["leaf_size"],
        allow_single_cluster=args["allow_single_cluster"],
    ).fit(self.data_matrix)
    # hdbscan.HDBSCAN has no predict() method; the labels for the fitted
    # data are exposed as labels_
    labels = model.labels_
    end = time.time()
    return labels, (end - start)
def hdbscan(self, min_cluster_size=10, prediction_data=False):
    """
    DBSCAN but allows for varying density clusters and no longer requires
    the epsilon parameter, which is difficult to tune.
    http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
    Scales slightly worse than DBSCAN, but with a more intuitive parameter.
    """
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data)
    if prediction_data:
        return hdbscan.fit(self._safe_dense(self.matrix))
    else:
        return hdbscan.fit(self.matrix)
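# Hedged usage sketch, not part of the original: when prediction_data=True,
# hdbscan's approximate_predict() can assign new points to the fitted
# clusters. `pipeline` (an instance of the class above) and `new_points`
# (a 2-D array of unseen samples) are hypothetical names.
from hdbscan.prediction import approximate_predict

fitted = pipeline.hdbscan(min_cluster_size=10, prediction_data=True)
new_labels, strengths = approximate_predict(fitted, new_points)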
def _run_hdbscan(affinity: np.ndarray,
                 min_cluster_size_for_hdbscan: int,
                 min_cluster_size: int,
                 max_cluster_size: int):
    assert affinity.shape[0] == affinity.shape[1]

    if affinity.shape[0] > max_cluster_size:
        allow_single_cluster = False
    else:
        allow_single_cluster = True

    db = HDBSCAN(metric='precomputed',
                 min_cluster_size=min_cluster_size_for_hdbscan,
                 min_samples=1,
                 allow_single_cluster=allow_single_cluster)
    db.fit(affinity)
    return db
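# Hedged usage sketch, not part of the original: despite the parameter name,
# metric='precomputed' in hdbscan expects a pairwise *distance* matrix, so the
# sketch builds one with scipy before calling the helper above.
import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.random.RandomState(0).rand(50, 3)
dist = squareform(pdist(pts))
db = _run_hdbscan(dist,
                  min_cluster_size_for_hdbscan=5,
                  min_cluster_size=5,
                  max_cluster_size=100)
print(np.unique(db.labels_))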
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN (Hierarchical Density-Based Spatial Clustering of
    Applications with Noise) to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size,
                        min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
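# Hedged usage sketch, not part of the original: the haversine metric expects
# coordinates in radians, so the sketch converts two small groups of lat/lng
# points before clustering them with the function above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
paris = np.radians([48.8566, 2.3522]) + rng.normal(scale=1e-4, size=(10, 2))
nyc = np.radians([40.7128, -74.0060]) + rng.normal(scale=1e-4, size=(10, 2))
points = pd.DataFrame(np.vstack([paris, nyc]), columns=['lat', 'lng'])
print(cluster(points, min_size=4))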
def perform_hdbscan(self, min_cluster_size=15):
    hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                metric="precomputed")
    hdbscan_clusterer.fit(self.distance_matrix)

    self.hdbscan_results = {
        "parameters": hdbscan_clusterer.get_params(),
        "labels": hdbscan_clusterer.labels_,
        "probabilities": hdbscan_clusterer.probabilities_,
        "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
        "clusters": label_cnt_dict(hdbscan_clusterer.labels_),
    }
    print_dict(self.hdbscan_results)
class HDBSCAN:
    def __init__(self):
        self.cluster = HDBSCANBase(algorithm='best',
                                   approx_min_span_tree=True,
                                   gen_min_span_tree=False,
                                   leaf_size=40,
                                   metric='euclidean',
                                   min_cluster_size=15,
                                   min_samples=15,
                                   p=None)

    def fit(self, X, y=None):
        self.cluster.fit(X)
from vispy.visuals.transforms import STTransform

phy.gui.create_app()

mua = MUA(filename='S:/pcie.bin')
spk = mua.tospk()
fet = spk.tofet('pca')

# spike sort channel-centered spiking events
ch = 26
min_cluster_size = 5
leaf_size = 10
hdbcluster = HDBSCAN(min_cluster_size=min_cluster_size,
                     leaf_size=leaf_size,
                     gen_min_span_tree=True,
                     algorithm='boruvka_kdtree')
clu = hdbcluster.fit_predict(fet[ch])
print('get clusters', np.unique(clu))

# from phy.gui import GUI, create_app, run_app
create_app()
gui = GUI(position=(400, 200), size=(600, 400))

scatter_view = view_scatter_3d()
scatter_view.attach(gui)
scatter_view.set_data(fet[ch], clu)
def fit(self, X, y=None, sample_weight=None):
    """X is a dataframe."""
    if self.method not in ("dbscan", "hdbscan", "spark"):
        raise ValueError("Unsupported method '%s'" % self.method)
    if not self.dbscan_params:
        self.dbscan_params = dict(
            min_samples=20, n_jobs=-1, algorithm='brute',
            metric=partial(distance_dataframe, X, **dict(
                junction_dist=StringDistance(), correct=False, tol=0)))
    if not self.hdbscan_params and self.method == 'hdbscan':
        self.hdbscan_params = dict(
            min_samples=20, n_jobs=-1,
            metric=partial(distance_dataframe, X, **dict(
                junction_dist=StringDistance(), correct=False, tol=0)))

    self.dbscan_params['eps'] = self.eps

    # new part: group by junction and v genes
    if self.method == 'hdbscan' and False:
        # no grouping; unsupported sample_weight
        groups_values = [[x] for x in np.arange(X.shape[0])]
    else:
        # list of lists
        groups_values = X.groupby(
            ["v_gene_set_str", self.model + "junc"]).groups.values()

    idxs = np.array([elem[0] for elem in groups_values])  # take one of them
    sample_weight = np.array([len(elem) for elem in groups_values])
    X_all = idxs.reshape(-1, 1)

    if self.kmeans_params.get('n_clusters', True):
        # ensure the number of clusters is not higher than the number of points
        self.kmeans_params['n_clusters'] = min(
            self.kmeans_params['n_clusters'], X_all.shape[0])
    kmeans = MiniBatchKMeans(**self.kmeans_params)

    lengths = X[self.model + 'junction_length'].values
    kmeans.fit(lengths[idxs].reshape(-1, 1))
    dbscan_labels = np.zeros_like(kmeans.labels_).ravel()

    if self.method == 'hdbscan':
        from hdbscan import HDBSCAN
        from hdbscan.prediction import all_points_membership_vectors
        dbscan_sk = HDBSCAN(**self.hdbscan_params)
    else:
        dbscan_sk = DBSCAN(**self.dbscan_params)
    if self.method == 'spark':
        from pyspark import SparkContext
        from icing.externals.pypardis import dbscan as dbpard
        sc = SparkContext.getOrCreate()
        sample_weight_map = dict(zip(idxs, sample_weight))
        # self.dbscan_params.pop('n_jobs', None)
        dbscan = dbpard.DBSCAN(dbscan_params=self.dbscan_params,
                               **self.dbspark_params)
    # else:

    for i, label in enumerate(np.unique(kmeans.labels_)):
        idx_row = np.where(kmeans.labels_ == label)[0]
        if self.verbose:
            print("Iteration %d/%d" % (i, np.unique(kmeans.labels_).size),
                  "(%d seqs)" % idx_row.size, end='\r')

        X_idx = idxs[idx_row].reshape(-1, 1).astype('float64')
        weights = sample_weight[idx_row]

        if idx_row.size == 1:
            db_labels = np.array([0])
        elif self.method == 'spark' and idx_row.size > 5000:
            test_data = sc.parallelize(enumerate(X_idx))
            dbscan.train(test_data, sample_weight=sample_weight_map)
            db_labels = np.array(dbscan.assignments())[:, 1]
        elif self.method == 'hdbscan':
            db_labels = dbscan_sk.fit_predict(X_idx)  # unsupported weights
            # avoid noise samples
            soft_clusters = all_points_membership_vectors(dbscan_sk)
            db_labels = np.array([np.argmax(x) for x in soft_clusters])
        else:
            db_labels = dbscan_sk.fit_predict(X_idx, sample_weight=weights)
            if len(dbscan_sk.core_sample_indices_) < 1:
                db_labels[:] = 0
            if -1 in db_labels:
                balltree = BallTree(
                    X_idx[dbscan_sk.core_sample_indices_],
                    metric=dbscan_sk.metric)
                noise_labels = balltree.query(
                    X_idx[db_labels == -1], k=1,
                    return_distance=False).ravel()
                # get labels for core points, then assign to noise points
                # based on balltree
                dbscan_noise_labels = db_labels[
                    dbscan_sk.core_sample_indices_][noise_labels]
                db_labels[db_labels == -1] = dbscan_noise_labels

        # hopefully, there are no noisy samples at this time
        db_labels[db_labels > -1] = (db_labels[db_labels > -1]
                                     + np.max(dbscan_labels) + 1)
        dbscan_labels[idx_row] = db_labels  # + np.max(dbscan_labels) + 1

    if self.method == 'spark':
        sc.stop()

    labels = dbscan_labels

    # new part: put together the labels
    labels_ext = np.zeros(X.shape[0], dtype=int)
    labels_ext[idxs] = labels
    for i, list_ in enumerate(groups_values):
        labels_ext[list_] = labels[i]
    self.labels_ = labels_ext