def test_hdbscan_no_clusters():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)
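# The tests in this file assume module-level fixtures such as X and
# n_clusters. A minimal sketch of such a fixture (an assumption, not the
# library's exact test harness):
#
# import numpy as np
# from sklearn.datasets import make_blobs
# from sklearn.preprocessing import StandardScaler
# from hdbscan import HDBSCAN, hdbscan
#
# n_clusters = 3
# X, y = make_blobs(n_samples=200, n_features=2, centers=n_clusters,
#                   random_state=10)
# X = StandardScaler().fit_transform(X)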
def test_hdbscan_boruvka_balltree_matches():
    data = generate_noisy_data()
    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='generic')
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='boruvka_balltree')
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert_less(num_mismatches / float(data.shape[0]), 0.15)

    labels_prims = HDBSCAN(algorithm='generic').fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert_less(num_mismatches / float(data.shape[0]), 0.15)
def test_hdbscan_best_balltree_metric():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, metric='seuclidean', V=np.ones(X.shape[1]))
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric='seuclidean',
                     V=np.ones(X.shape[1])).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_condensed_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.condensed_tree_.plot)(
        select_clusters=True,
        label_clusters=True,
        selection_palette=('r', 'g', 'b'),
        cmap='Reds')
    if_matplotlib(clusterer.condensed_tree_.plot)(
        log_size=True, colorbar=False, cmap='none')
def test_tree_output_formats():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    clusterer.condensed_tree_.to_pandas()
    clusterer.condensed_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_pandas()
    clusterer.single_linkage_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_numpy()
    clusterer.minimum_spanning_tree_.to_pandas()
    clusterer.minimum_spanning_tree_.to_networkx()
def test_min_span_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap='Reds')

    H, y = make_blobs(n_samples=50, random_state=0, n_features=10)
    H = StandardScaler().fit_transform(H)
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(
        edge_cmap='Reds', vary_line_width=False, colorbar=False)

    H, y = make_blobs(n_samples=50, random_state=0, n_features=40)
    H = StandardScaler().fit_transform(H)
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(
        edge_cmap='Reds', vary_line_width=False, colorbar=False)
def test_hdbscan_approximate_predict_score():
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores,
                  clusterer, np.array([[1, 2, 3]]))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
        # no clusters warning
        assert "Clusterer does not have any defined clusters" in str(
            w[-1].message)
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1
def test_hdbscan_boruvka_kdtree_matches():
    data = generate_noisy_data()
    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="generic")
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="boruvka_kdtree")
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert (num_mismatches / float(data.shape[0])) < 0.15

    labels_prims = HDBSCAN(algorithm="generic").fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm="boruvka_kdtree").fit_predict(data)
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert (num_mismatches / float(data.shape[0])) < 0.15
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering for the input data. Note that the
    similarity matrix S is unused; clustering runs on the feature matrix X.
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN
    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
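# Usage sketch (hypothetical): `config` only needs an `as_int` accessor, as
# provided e.g. by configobj-style section objects. A minimal stand-in:
class _FakeConfig:
    def __init__(self, values):
        self._values = values

    def as_int(self, key):
        # mirror the only config method hdbscan_clustering relies on
        return int(self._values[key])

# labels = hdbscan_clustering(None, X, _FakeConfig({"min_cluster_size": 15}))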
def get_hotspots(**kwargs):
    '''Return the stable clusters from the condensed tree of connected
    components from the density graph'''
    print(' * HDBSCAN clustering data with ' +
          str(multiprocessing.cpu_count()) + ' cores...')
    config = {
        'core_dist_n_jobs': multiprocessing.cpu_count(),
        'min_cluster_size': kwargs['min_cluster_size'],
        'cluster_selection_epsilon': 0.01,
        'min_samples': 1,
        'approx_min_span_tree': False,
    }
    v = kwargs['vecs']
    z = HDBSCAN(**config).fit(v)
    # find the points in each cluster
    d = defaultdict(list)
    for idx, i in enumerate(z.labels_):
        d[i].append(v[idx])
    # find the convex hull for each cluster's points
    convex_hulls = []
    for i in d:
        hull = ConvexHull(d[i])
        points = [hull.points[j] for j in hull.vertices]
        # the last convex hull simplex needs to connect back to the first point
        convex_hulls.append(np.vstack([points, points[0]]))
    # find the centroids for each cluster
    centroids = []
    for i in d:
        x, y = np.array(d[i]).T
        centroids.append(np.array([np.mean(x), np.mean(y)]))
    # identify the number of points in each cluster
    lens = [len(d[i]) for i in d]
    # combine data into cluster objects
    closest, _ = pairwise_distances_argmin_min(centroids, v)
    paths = [kwargs['image_paths'][i] for i in closest]
    clusters = [{
        'img': clean_filename(paths[idx]),
        'convex_hull': convex_hulls[idx].tolist(),
        'n_images': lens[idx],
    } for idx, i in enumerate(closest)]
    # remove massive clusters (polygon area computed via the shoelace formula)
    retained = []
    for idx, i in enumerate(clusters):
        x, y = np.array(i['convex_hull']).T
        area = 0.5 * np.abs(
            np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
        if area < 0.2:
            retained.append(i)
    # sort the clusters by size
    clusters = sorted(retained, key=lambda i: i['n_images'], reverse=True)
    for idx, i in enumerate(clusters):
        i['label'] = 'Cluster {}'.format(idx + 1)
    # save the hotspots to disk and return the path to the saved json
    print(' * found', len(clusters), 'hotspots')
    return write_json(get_path('hotspots', 'hotspot', **kwargs),
                      clusters, **kwargs)
def hdbscan(self, args):
    start = time.time()
    model = HDBSCAN(
        min_cluster_size=args["min_cluster_size"],
        metric=args["metric"],
        leaf_size=args["leaf_size"],
        allow_single_cluster=args["allow_single_cluster"],
    ).fit(self.data_matrix)
    # HDBSCAN has no predict(); labels for the fitted data live on labels_
    labels = model.labels_
    end = time.time()
    return labels, (end - start)
def test_hdbscan_membership_vector():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.05705305, 0.05974177, 0.12228153]]))
    vector = membership_vector(clusterer, np.array([[1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.09462176, 0.32061556, 0.10112905]]))
    vector = membership_vector(clusterer, np.array([[0.0, 0.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.03545607, 0.03363318, 0.04643177]]))
def hdbscan_cluster(params):
    print("LOAD CORPUS START")
    content = read_file(params.train_file)
    train_feature = pickle.load(open(params.feature_file, 'rb'))
    indices = list(range(len(train_feature)))
    import random
    SEED = 42
    random.seed(SEED)
    random.shuffle(indices)
    if params.sample_number >= len(train_feature):
        print("sample_number:" + str(len(train_feature)))
    indices = indices[:params.sample_number]
    content = np.array(content)[indices].tolist()
    train_feature = np.array(train_feature)[indices].tolist()

    from hdbscan import HDBSCAN
    print("HDBSCAN STARTING....")
    hdb = HDBSCAN(min_samples=1).fit(train_feature)
    data_dir, _ = os.path.split(params.train_file)
    pickle.dump(
        hdb,
        open(os.path.join(data_dir,
                          params.method + str(params.sample_number) + '.obj'),
             'wb'))
    sample_labels = hdb.labels_
    n_clusters_hdb_ = len(
        set(sample_labels)) - (1 if -1 in sample_labels else 0)
    print('\n\n++ HDBSCAN Results')
    print('Estimated number of clusters: %d' % n_clusters_hdb_)
    with open(params.train_file + params.method + str(params.sample_number),
              'w') as file_w:
        for idx, line in enumerate(content):
            file_w.write(str(sample_labels[idx]) + '\t' + line + '\n')

    # Dimensionality reduction may be needed to visualize the clustering:
    # import matplotlib.pyplot as plt
    # hdb_unique_labels = set(hdb_labels)
    # hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
    # fig = plt.figure(figsize=plt.figaspect(0.5))
    # hdb_axis = fig.add_subplot('111')
    # for k, col in zip(hdb_unique_labels, hdb_colors):
    #     if k == -1:
    #         # Black used for noise.
    #         col = 'k'
    #     hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o',
    #                   markerfacecolor=col, markeredgecolor='k',
    #                   markersize=6)
    # hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d'
    #                    % n_clusters_hdb_)
def test_hdbscan_centroids_medoids():
    centers = [(0.0, 0.0), (3.0, 3.0)]
    H, y = make_blobs(n_samples=1000, random_state=0,
                      centers=centers, cluster_std=0.5)
    clusterer = HDBSCAN().fit(H)
    for idx, center in enumerate(centers):
        centroid = clusterer.weighted_cluster_centroid(idx)
        assert_array_almost_equal(centroid, center, decimal=1)
        medoid = clusterer.weighted_cluster_medoid(idx)
        assert_array_almost_equal(medoid, center, decimal=1)
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def fit(self, data, min_cluster_size, min_samples, alpha,
        cluster_selection_method):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    alpha=alpha,
                    cluster_selection_method=cluster_selection_method,
                    allow_single_cluster=True)
    clustering = model.fit(data)
    return clustering
def test_hdbscan_feature_vector():
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    labels = HDBSCAN().fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters

    validity = validity_index(X, labels)
    assert validity >= 0.4
def run_hdbscan(X, COLS, MCS, MS, DF, y):
    """
    Run HDBSCAN for the given data and a combination of
    Min Cluster Size and Min Samples.

    Parameters
    ----------
    X: pandas.DataFrame
        The input data to be clustered
    COLS: list
        The columns from X to be considered
    MCS: int
        Min Cluster Size, input to HDBSCAN()
    MS: int
        Min Samples, input to HDBSCAN()
    DF: pandas.DataFrame
        Data for profiling
    y: pandas.Series
        Series named 'y', joined to DF for profiling

    Returns
    -------
    Dict with the profile and labels
    """
    hdb = (HDBSCAN(min_cluster_size=MCS, min_samples=MS)
           .fit(X[COLS]))
    df = (DF
          .join(y)
          .assign(clus=hdb.labels_)
          .query("clus != -1"))
    profile = (df
               .groupby('clus')
               .mean()
               .join(Series(hdb.labels_, name='size')
                     .value_counts(normalize=True))
               .T
               .round(2)
               .loc[COLS + ['size', 'y'], :])
    return {
        'profile': (profile.loc[['size', 'y']].T
                    .sort_values('y')
                    .query("y > 0.2")),
        'labels': hdb.labels_,
    }
def test_hdbscan_min_cluster_size():
    for min_cluster_size in range(2, len(X) + 1, 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(
            X, min_cluster_size=min_cluster_size)
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size

        labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN -- (Hierarchical Density-Based Spatial Clustering of
    Applications with Noise) to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size, min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
    # noise points (label -1) index the last letter, 'N'
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
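# Caveat (assumption about the haversine metric as implemented in
# scikit-learn's trees, which hdbscan uses): it expects (lat, lng) in
# radians and returns distances on the unit sphere, so degree-valued
# coordinates should be converted first. Hypothetical usage, where `df`
# holds lat/lng in degrees:
#
# import numpy as np
# df[['lat', 'lng']] = np.radians(df[['lat', 'lng']])
# clustered = cluster(df)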
def perform_hdbscan(self, min_cluster_size=15):
    hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                metric="precomputed")
    hdbscan_clusterer.fit(self.distance_matrix)
    self.hdbscan_results = {
        "parameters": hdbscan_clusterer.get_params(),
        "labels": hdbscan_clusterer.labels_,
        "probabilities": hdbscan_clusterer.probabilities_,
        "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
        "clusters": label_cnt_dict(hdbscan_clusterer.labels_),
    }
    print_dict(self.hdbscan_results)
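# Usage sketch (assumption: `self.distance_matrix` is a square, symmetric
# matrix of pairwise distances). One common way to build such a matrix with
# scikit-learn, here with a hypothetical feature matrix `X` and the cosine
# metric as an example:
#
# from sklearn.metrics import pairwise_distances
# distance_matrix = pairwise_distances(X, metric='cosine')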
def clusterMessageTypesHDBSCAN(self, min_cluster_size=10, min_samples=2) \
        -> Tuple[Dict[int, List[Tuple[MessageSegment]]], numpy.ndarray, HDBSCAN]:
    clusterer = HDBSCAN(metric='precomputed', allow_single_cluster=True,
                        cluster_selection_method='leaf',
                        min_cluster_size=min_cluster_size,
                        min_samples=min_samples)
    print("Messages: HDBSCAN min cluster size:", min_cluster_size,
          "min samples:", min_samples)
    segmentClusters, labels = self._postprocessClustering(clusterer)
    return segmentClusters, labels, clusterer
def test_hdbscan_distance_matrix():
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric='precomputed')
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_hdbscan_high_dimensional():
    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    # H, y = shuffle(X, y, random_state=7)
    H = StandardScaler().fit_transform(H)
    labels, p, persist, ctree, ltree, mtree = hdbscan(H)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(algorithm='best', metric='seuclidean',
                     V=np.ones(H.shape[1])).fit(H).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def _run_hdbscan(affinity: np.ndarray, min_cluster_size_for_hdbscan: int,
                 min_cluster_size: int, max_cluster_size: int):
    assert affinity.shape[0] == affinity.shape[1]
    # only allow everything to collapse into a single cluster when the
    # input is small enough
    allow_single_cluster = affinity.shape[0] <= max_cluster_size
    db = HDBSCAN(metric='precomputed',
                 min_cluster_size=min_cluster_size_for_hdbscan,
                 min_samples=1,
                 allow_single_cluster=allow_single_cluster)
    db.fit(affinity)
    return db
def hdbscan(self, min_cluster_size=10, prediction_data=False):
    """
    DBSCAN but allows for varying density clusters and no longer requires
    the epsilon parameter, which is difficult to tune.

    http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

    Scales slightly worse than DBSCAN, but with a more intuitive parameter.
    """
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data)
    if prediction_data:
        # prediction data generation needs a dense matrix, hence the
        # conversion here
        return hdbscan.fit(self._safe_dense(self.matrix))
    else:
        return hdbscan.fit(self.matrix)
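# Usage sketch (hypothetical object and data names): with
# prediction_data=True, the fitted model can assign new points via the
# library's approximate_predict without refitting:
#
# from hdbscan.prediction import approximate_predict
# clusterer = pipeline.hdbscan(min_cluster_size=10, prediction_data=True)
# new_labels, strengths = approximate_predict(clusterer, new_points)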
def test_hdbscan_allow_single_cluster_with_epsilon():
    np.random.seed(0)
    no_structure = np.random.rand(150, 2)

    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.0,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 46

    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.2,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2