def hnsw_hdbscan(data, d, m=5, ef=50, m0=None, level_mult=None, heuristic=True, balanced_add=True, **kwargs):
    """Simple implementation for when you don't need incremental updates.

    Builds an HNSW index over ``data`` while recording every pairwise
    distance the index evaluates into a sparse matrix, then runs HDBSCAN
    on that matrix in precomputed-metric mode.
    """
    n = len(data)
    distance_matrix = scipy.sparse.lil_matrix((n, n))

    def recording_metric(i, j):
        # Cache each evaluated distance symmetrically so the matrix can be
        # handed to hdbscan as a precomputed metric afterwards.
        dist = d(data[i], data[j])
        distance_matrix[i, j] = dist
        distance_matrix[j, i] = dist
        return dist

    index = hnsw.HNSW(recording_metric, m, ef, m0, level_mult, heuristic)
    insert = index.balanced_add if balanced_add else index.add
    for point_idx in range(n):
        insert(point_idx)
    return hdbscan.hdbscan(distance_matrix, metric='precomputed', **kwargs)
def hdbscan_parameter_search(X, min_cluster_size_min=4, min_cluster_size_max=10,
                             min_samples_min=4, min_samples_max=10,
                             target_label_min=5, target_label_max=260,
                             cluster_selection_method='leaf'):
    """Brute-force search over (min_cluster_size, min_samples) combinations.

    Runs hdbscan for every combination in the (exclusive-upper-bound,
    ``range``-style) parameter grid, keeps the combinations whose maximum
    cluster label falls in [target_label_min, target_label_max], and picks
    the one with the most clusters (ties broken towards the last-tried
    combination, preserved via the reversal below).

    Returns:
        (min_cluster_size, min_samples) of the best combination, or None
        when no combination produced a cluster count in the target window.
    """
    sizes = range(min_cluster_size_min, min_cluster_size_max)
    ranges = range(min_samples_min, min_samples_max)
    all_combinations = ((c, s) for c in sizes for s in ranges)
    logging.info('Searching for clusters with: %d <= label_max <= %d',
                 target_label_min, target_label_max)
    cluster_params = []
    label_values = []
    cluster_bincounts = []
    # Calculate results of hdbscan for every combination, save results.
    for min_cluster_size, min_samples in all_combinations:
        labels, *rest = hdbscan.hdbscan(
            X,
            approx_min_span_tree=False,
            cluster_selection_method=cluster_selection_method,
            min_cluster_size=min_cluster_size,
            min_samples=min_samples)
        true_labels = [label for label in labels if label != -1]
        try:
            label_max = np.max(true_labels)
        except ValueError as ex:
            # np.max raises ValueError on an empty sequence, i.e. when every
            # point was labelled noise (-1). Narrowed from a broad Exception
            # so genuine bugs are not swallowed here.
            logging.error('(%s) labels %s', ex, set(labels))
            continue
        if target_label_min <= label_max <= target_label_max:
            label_values.append(label_max)
            cluster_params.append((min_cluster_size, min_samples))
            cluster_bincounts.append(np.bincount(true_labels))
    # Reverse so np.argmax (first occurrence) selects the last-tried
    # combination among equally good solutions.
    label_values = list(reversed(label_values))
    cluster_params = list(reversed(cluster_params))
    cluster_bincounts = list(reversed(cluster_bincounts))
    if not label_values:
        # Previously fell through and returned None silently; make the
        # failure explicit for the caller reading the logs.
        logging.warning('No parameter combination yielded %d..%d clusters',
                        target_label_min, target_label_max)
        return None
    # Select a solution of the clustering problem that has the most clusters.
    i = np.argmax(label_values)
    min_cluster_size_opt, min_samples_opt = cluster_params[i]
    label_max = label_values[i]
    # Print out all the solutions with the max number of clusters.
    # Lazy %-style args instead of eager string interpolation.
    for j, label in enumerate(label_values):
        if label == label_max:
            logging.info('%d %d %d %s',
                         *cluster_params[j], label, cluster_bincounts[j])
    logging.info('label_max = %d, min_cluster_size = %d, min_samples = %d',
                 label_max, min_cluster_size_opt, min_samples_opt)
    return min_cluster_size_opt, min_samples_opt
def test_hdbscan_no_clusters():
    """A min_cluster_size larger than the dataset must yield zero clusters."""
    # Unpack only the labels: this copy of the test expected a 5-tuple while
    # every other call site in this file unpacks 6 values (incl. cluster
    # persistence), which would make the fixed-arity unpack raise.
    labels, *_ = hdbscan(X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)
def test_hdbscan_feature_vector():
    """Clustering raw feature vectors finds the expected number of clusters."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    # Count distinct labels, excluding the noise label (-1) when present.
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN().fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_no_clusters():
    """A min_cluster_size larger than the dataset must yield zero clusters."""
    oversized = len(X) + 1
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=oversized)
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), 0)

    labels = HDBSCAN(min_cluster_size=oversized).fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), 0)
def test_hdbscan_boruvka_balltree_matches():
    """Generic and Boruvka ball-tree runs agree on at least 95% of points."""
    data = generate_noisy_data()
    n_points = float(data.shape[0])

    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm='generic')
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm='boruvka_balltree')
    mismatch_rate = homogeneity(labels_prims, labels_boruvka) / n_points
    assert_less(mismatch_rate, 0.05)

    labels_prims = HDBSCAN(algorithm='generic').fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)
    mismatch_rate = homogeneity(labels_prims, labels_boruvka) / n_points
    assert_less(mismatch_rate, 0.05)
def test_hdbscan_boruvka_balltree_matches():
    """Generic and Boruvka ball-tree runs agree on at least 98.5% of points."""
    data = generate_noisy_data()
    n_points = float(data.shape[0])

    # This variant of hdbscan returns a 5-tuple (no persistence element).
    labels_prims, p, ctree, ltree, mtree = hdbscan(data, algorithm='generic')
    labels_boruvka, p, ctree, ltree, mtree = hdbscan(data, algorithm='boruvka_balltree')
    mismatch_rate = homogeneity(labels_prims, labels_boruvka) / n_points
    assert_less(mismatch_rate, 0.015)

    labels_prims = HDBSCAN(algorithm='generic').fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)
    mismatch_rate = homogeneity(labels_prims, labels_boruvka) / n_points
    assert_less(mismatch_rate, 0.015)
def test_hdbscan_best_balltree_metric():
    """Standardized-Euclidean metric produces the expected cluster count."""
    weights = np.ones(X.shape[1])
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric='seuclidean', V=weights)
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(metric='seuclidean', V=weights).fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_generic():
    """The generic algorithm recovers the expected number of clusters."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="generic")
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN(algorithm="generic", gen_min_span_tree=True).fit(X).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters
def test_hdbscan_generic():
    """The generic algorithm recovers the expected number of clusters."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm='generic')
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(algorithm='generic', gen_min_span_tree=True).fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_boruvka_balltree_matches():
    """Generic and Boruvka ball-tree labelings differ on under 15% of points."""
    data = generate_noisy_data()
    n_points = float(data.shape[0])

    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="generic")
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="boruvka_balltree")
    assert homogeneity(labels_prims, labels_boruvka) / n_points < 0.15

    labels_prims = HDBSCAN(algorithm="generic").fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm="boruvka_balltree").fit_predict(data)
    assert homogeneity(labels_prims, labels_boruvka) / n_points < 0.15
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_callable_metric():
    """A callable metric works the same as its string-keyed counterpart."""
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # BUG FIX: hdbscan() returns a tuple (labels, probabilities, ...);
    # the original assigned the whole tuple to `labels`, so the subsequent
    # set()/membership logic operated on the wrong object. Unpack labels.
    labels, *_ = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN(metric=metric).fit(X).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters
def test_hdbscan_feature_vector():
    """Feature-vector clustering finds n_clusters and scores decent validity."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN().fit(X).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    # Density-based cluster validity of the sklearn-style labeling.
    assert validity_index(X, labels) >= 0.4
def test_hdbscan_high_dimensional():
    """Clustering still works on standardized 64-dimensional blob data."""
    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    # H, y = shuffle(X, y, random_state=7)
    H = StandardScaler().fit_transform(H)

    labels, p, persist, ctree, ltree, mtree = hdbscan(H)
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(algorithm='best', metric='seuclidean',
                     V=np.ones(H.shape[1])).fit(H).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_distance_matrix():
    """A normalized precomputed distance matrix yields the expected clusters."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric='precomputed')
    # number of clusters, ignoring noise if present
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
def test_hdbscan_min_cluster_size():
    """Every reported cluster contains at least min_cluster_size members."""
    for size in range(2, len(X) + 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=size)
        non_noise = [lbl for lbl in labels if lbl != -1]
        if non_noise:
            assert np.min(np.bincount(non_noise)) >= size

        labels = HDBSCAN(min_cluster_size=size).fit(X).labels_
        non_noise = [lbl for lbl in labels if lbl != -1]
        if non_noise:
            assert np.min(np.bincount(non_noise)) >= size
def test_hdbscan_min_cluster_size():
    """Every reported cluster contains at least min_cluster_size members."""
    for size in range(2, len(X) + 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=size)
        non_noise = [lbl for lbl in labels if lbl != -1]
        if non_noise:
            assert_greater_equal(np.min(np.bincount(non_noise)), size)

        labels = HDBSCAN(min_cluster_size=size).fit(X).labels_
        non_noise = [lbl for lbl in labels if lbl != -1]
        if non_noise:
            assert_greater_equal(np.min(np.bincount(non_noise)), size)
def test_hdbscan_distance_matrix():
    """Precomputed distances give the expected clusters and good validity."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric="precomputed")
    # number of clusters, ignoring noise if present
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    # Density-based validity computed straight from the distance matrix.
    assert validity_index(D, labels, metric="precomputed", d=2) >= 0.6
def test_hdbscan_boruvka_balltree():
    """Boruvka ball-tree finds n_clusters and rejects unsupported metrics."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, algorithm='boruvka_balltree')
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(algorithm='boruvka_balltree',
                     gen_min_span_tree=True).fit(X).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    # Cosine distance is not valid for a ball tree, so this must raise.
    assert_raises(ValueError, hdbscan, X,
                  algorithm='boruvka_balltree', metric='cosine')
def test_hdbscan_boruvka_balltree():
    """Boruvka ball-tree finds n_clusters and rejects unsupported metrics."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, algorithm="boruvka_balltree")
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN(algorithm="boruvka_balltree",
                     gen_min_span_tree=True).fit(X).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    # Cosine distance is not valid for a ball tree, so this must raise.
    assert_raises(ValueError, hdbscan, X,
                  algorithm="boruvka_balltree", metric="cosine")
def test_hdbscan_prims_kdtree():
    """Prim's kd-tree finds n_clusters and rejects unsupported metrics."""
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, algorithm="prims_kdtree")
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    labels = HDBSCAN(algorithm="prims_kdtree",
                     gen_min_span_tree=True).fit(X).labels_
    found = set(labels)
    assert len(found) - int(-1 in found) == n_clusters

    # Russell-Rao distance is not valid for a kd tree, so this must raise.
    assert_raises(ValueError, hdbscan, X,
                  algorithm="prims_kdtree", metric="russelrao")
def test_hdbscan_sparse_distance_matrix():
    """HDBSCAN handles a sparse precomputed distance matrix."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    # Drop the closer half of all pairwise distances to sparsify the matrix.
    cutoff = stats.scoreatpercentile(D.flatten(), 50)
    D[D >= cutoff] = 0.0
    D = sparse.csr_matrix(D)
    D.eliminate_zeros()

    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric='precomputed')
    # number of clusters, ignoring noise if present
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    found = set(labels)
    assert_equal(len(found) - int(-1 in found), n_clusters)
#all the different n-grams in the texts #terms = tfidf_vectorizer.get_feature_names() #clusterer = hdbscan.HDBSCAN(min_cluster_size=2) #result = clusterer.fit_predict(tfidf_matrix) X = tfidf_matrix.todense() t4 = time.time() print("time to convert tf idf sparse matrix to dense matkrix: " + str(t4-t3)) labels, probabilities,cluster_persistence,condensed_tree,single_linkage_tree,min_spanning_tree = hdbscan.hdbscan(X = X, min_cluster_size=4) t5 = time.time() print("time to apply HDBSCAN: " + str(t5-t4)) ''' print(labels) print() print(probabilities) print() print(cluster_persistence) print() print(condensed_tree) print() print(single_linkage_tree)