Ejemplo n.º 1
0
def hnsw_hdbscan(data,
                 d,
                 m=5,
                 ef=50,
                 m0=None,
                 level_mult=None,
                 heuristic=True,
                 balanced_add=True,
                 **kwargs):
    """Cluster `data` with HDBSCAN over distances observed while building an HNSW index.

    The user metric `d` is wrapped so that every pairwise distance computed
    during index construction is recorded symmetrically in a sparse matrix;
    HDBSCAN is then run on that (partial) precomputed matrix. Intended for
    the one-shot case where incremental updates are not needed.
    """
    num_points = len(data)
    # Accumulates every distance the index evaluates, symmetrically.
    distances = scipy.sparse.lil_matrix((num_points, num_points))

    def recording_metric(a, b):
        # Evaluate the user metric and cache the result both ways.
        value = d(data[a], data[b])
        distances[a, b] = value
        distances[b, a] = value
        return value

    index = hnsw.HNSW(recording_metric, m, ef, m0, level_mult, heuristic)
    insert = index.balanced_add if balanced_add else index.add
    for point_idx in range(num_points):
        insert(point_idx)

    return hdbscan.hdbscan(distances, metric='precomputed', **kwargs)
Ejemplo n.º 2
0
def hdbscan_parameter_search(X,
                             min_cluster_size_min=4,
                             min_cluster_size_max=10,
                             min_samples_min=4,
                             min_samples_max=10,
                             target_label_min=5,
                             target_label_max=260,
                             cluster_selection_method='leaf'):
    """Brute-force search over ``min_cluster_size`` / ``min_samples``.

    Runs ``hdbscan.hdbscan`` on ``X`` for every combination in the two
    half-open ranges, keeps the combinations whose maximum cluster label
    falls inside ``[target_label_min, target_label_max]``, and returns the
    combination producing the most clusters.

    Args:
        X: data (or precomputed matrix) forwarded to ``hdbscan.hdbscan``.
        min_cluster_size_min, min_cluster_size_max: half-open range of
            ``min_cluster_size`` values to try.
        min_samples_min, min_samples_max: half-open range of
            ``min_samples`` values to try.
        target_label_min, target_label_max: inclusive bounds on acceptable
            maximum cluster labels.
        cluster_selection_method: forwarded to ``hdbscan.hdbscan``.

    Returns:
        ``(min_cluster_size_opt, min_samples_opt)`` for the best combination.

    Raises:
        ValueError: if no combination yields a ``label_max`` inside the
            target interval. (The original code crashed with
            ``UnboundLocalError`` in that case.)
    """
    sizes = range(min_cluster_size_min, min_cluster_size_max)
    ranges = range(min_samples_min, min_samples_max)
    all_combinations = ((c, s) for c in sizes for s in ranges)

    logging.info('Searching for clusters with: %d <= label_max <= %d',
                 target_label_min, target_label_max)
    cluster_params = []
    label_values = []
    cluster_bincounts = []

    # Calculate results of hdbscan for every combination, save results.
    for min_cluster_size, min_samples in all_combinations:
        labels, *rest = hdbscan.hdbscan(
            X,
            approx_min_span_tree=False,
            cluster_selection_method=cluster_selection_method,
            min_cluster_size=min_cluster_size,
            min_samples=min_samples)
        true_labels = [label for label in labels if label != -1]
        try:
            label_max = np.max(true_labels)
        except ValueError as ex:
            # np.max raises ValueError on an empty sequence (all noise).
            logging.error('(%s) labels %s', ex, set(labels))
            continue

        if target_label_min <= label_max <= target_label_max:
            label_values.append(label_max)
            cluster_params.append((min_cluster_size, min_samples))
            cluster_bincounts.append(np.bincount(true_labels))

    # Bug fix: previously an empty result list left label_max /
    # min_cluster_size_opt / min_samples_opt unbound, crashing below with
    # UnboundLocalError. Fail loudly with a clear message instead.
    if not label_values:
        raise ValueError(
            'no parameter combination produced a label_max in '
            '[%d, %d]' % (target_label_min, target_label_max))

    # Reverse so that, on ties, np.argmax (which returns the first maximum)
    # selects the combination generated last in the original order.
    label_values = list(reversed(label_values))
    cluster_params = list(reversed(cluster_params))
    cluster_bincounts = list(reversed(cluster_bincounts))

    # Select the solution of the clustering problem that has the most clusters.
    i = np.argmax(label_values)
    min_cluster_size_opt, min_samples_opt = cluster_params[i]
    label_max = label_values[i]

    # Log all the solutions that tie with the max number of clusters.
    for i, label in enumerate(label_values):
        if label == label_max:
            logging.info('%d %d %d %s',
                         *cluster_params[i], label, cluster_bincounts[i])

    logging.info('label_max = %d, min_cluster_size = %d, min_samples = %d',
                 label_max, min_cluster_size_opt, min_samples_opt)

    return min_cluster_size_opt, min_samples_opt
Ejemplo n.º 3
0
def test_hdbscan_no_clusters():
    """With min_cluster_size larger than the data set, everything is noise."""
    labels, _prob, _cond, _link, _mst = hdbscan(X, min_cluster_size=len(X)+1)
    # Distinct labels with the noise marker (-1) removed.
    n_clusters_1 = len(set(labels) - {-1})
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X)+1).fit(X).labels_
    n_clusters_2 = len(set(labels) - {-1})
    assert_equal(n_clusters_2, 0)
Ejemplo n.º 4
0
def test_hdbscan_feature_vector():
    """Functional and estimator APIs both find the expected cluster count."""
    def count_clusters(labels):
        # Distinct labels minus the noise label if present.
        return len(set(labels) - {-1})

    labels, _prob, _persist, _cond, _link, _mst = hdbscan(X)
    assert_equal(count_clusters(labels), n_clusters)

    estimator_labels = HDBSCAN().fit(X).labels_
    assert_equal(count_clusters(estimator_labels), n_clusters)
Ejemplo n.º 5
0
def test_hdbscan_no_clusters():
    """A min_cluster_size above len(X) must label every point as noise."""
    too_big = len(X) + 1

    labels, _prob, _persist, _cond, _link, _mst = hdbscan(
        X, min_cluster_size=too_big)
    assert_equal(len(set(labels) - {-1}), 0)

    labels = HDBSCAN(min_cluster_size=too_big).fit(X).labels_
    assert_equal(len(set(labels) - {-1}), 0)
Ejemplo n.º 6
0
def test_hdbscan_feature_vector():
    """Clustering a raw feature matrix recovers the known cluster count."""
    labels, _prob, _persist, _cond, _link, _mst = hdbscan(X)
    unique = set(labels)
    n_clusters_1 = len(unique) - (1 if -1 in unique else 0)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN().fit(X).labels_
    unique = set(labels)
    n_clusters_2 = len(unique) - (1 if -1 in unique else 0)
    assert_equal(n_clusters_2, n_clusters)
Ejemplo n.º 7
0
def test_hdbscan_boruvka_balltree_matches():
    """Boruvka/ball-tree clustering stays close to the generic algorithm."""
    data = generate_noisy_data()
    n_points = float(data.shape[0])

    generic_labels, _p, _pers, _ct, _lt, _mt = hdbscan(
        data, algorithm='generic')
    boruvka_labels, _p, _pers, _ct, _lt, _mt = hdbscan(
        data, algorithm='boruvka_balltree')

    # At most 5% of points may be labeled differently.
    assert_less(homogeneity(generic_labels,  boruvka_labels) / n_points, 0.05)

    generic_labels = HDBSCAN(algorithm='generic').fit_predict(data)
    boruvka_labels = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)

    assert_less(homogeneity(generic_labels,  boruvka_labels) / n_points, 0.05)
Ejemplo n.º 8
0
def test_hdbscan_boruvka_balltree_matches():
    """Generic and boruvka_balltree labelings differ on at most 1.5% of points."""
    data = generate_noisy_data()

    def mismatch_rate(a, b):
        # Fraction of points on which the two labelings disagree.
        return homogeneity(a,  b) / float(data.shape[0])

    prims, _p, _ct, _lt, _mt = hdbscan(data, algorithm='generic')
    boruvka, _p, _ct, _lt, _mt = hdbscan(data, algorithm='boruvka_balltree')
    assert_less(mismatch_rate(prims, boruvka), 0.015)

    prims = HDBSCAN(algorithm='generic').fit_predict(data)
    boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)
    assert_less(mismatch_rate(prims, boruvka), 0.015)
Ejemplo n.º 9
0
def test_hdbscan_best_balltree_metric():
    """The 'seuclidean' metric with an explicit V vector works end to end."""
    weights = np.ones(X.shape[1])

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(X, metric='seuclidean',
                                               V=weights)
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(metric='seuclidean', V=weights).fit(X).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)
Ejemplo n.º 10
0
def test_hdbscan_generic():
    """The generic algorithm recovers the expected number of clusters."""
    labels, _p, _pers, _ct, _lt, _mt = hdbscan(X, algorithm="generic")
    assert len(set(labels) - {-1}) == n_clusters

    labels = HDBSCAN(algorithm="generic",
                     gen_min_span_tree=True).fit(X).labels_
    assert len(set(labels) - {-1}) == n_clusters
Ejemplo n.º 11
0
def test_hdbscan_generic():
    """Generic algorithm: both APIs find n_clusters non-noise clusters."""
    def non_noise_count(labels):
        # Distinct labels, not counting the noise label -1.
        unique = set(labels)
        return len(unique) - (1 if -1 in unique else 0)

    labels, _prob, _pers, _cond, _link, _mst = hdbscan(X, algorithm='generic')
    assert_equal(non_noise_count(labels), n_clusters)

    model = HDBSCAN(algorithm='generic', gen_min_span_tree=True).fit(X)
    assert_equal(non_noise_count(model.labels_), n_clusters)
Ejemplo n.º 12
0
def test_hdbscan_best_balltree_metric():
    """Standardized-Euclidean metric works through both APIs."""
    labels, _prob, _pers, _cond, _link, _mst = hdbscan(
        X, metric='seuclidean', V=np.ones(X.shape[1]))
    distinct = set(labels)
    assert_equal(len(distinct) - (1 if -1 in distinct else 0), n_clusters)

    labels = HDBSCAN(metric='seuclidean', V=np.ones(X.shape[1])).fit(X).labels_
    distinct = set(labels)
    assert_equal(len(distinct) - (1 if -1 in distinct else 0), n_clusters)
Ejemplo n.º 13
0
def test_hdbscan_generic():
    """The 'generic' algorithm yields the known ground-truth cluster count."""
    labels, _a, _b, _c, _d, _e = hdbscan(X, algorithm='generic')
    noise_seen = -1 in labels
    assert_equal(len(set(labels)) - int(noise_seen), n_clusters)

    labels = HDBSCAN(algorithm='generic',
                     gen_min_span_tree=True).fit(X).labels_
    noise_seen = -1 in labels
    assert_equal(len(set(labels)) - int(noise_seen), n_clusters)
Ejemplo n.º 14
0
def test_hdbscan_boruvka_balltree_matches():
    """boruvka_balltree disagrees with the generic backend on < 15% of points."""
    data = generate_noisy_data()
    limit = 0.15
    size = float(data.shape[0])

    generic, _p, _pers, _ct, _lt, _mt = hdbscan(
        data, algorithm="generic")
    balltree, _p, _pers, _ct, _lt, _mt = hdbscan(
        data, algorithm="boruvka_balltree")
    assert homogeneity(generic, balltree) / size < limit

    generic = HDBSCAN(algorithm="generic").fit_predict(data)
    balltree = HDBSCAN(algorithm="boruvka_balltree").fit_predict(data)
    assert homogeneity(generic, balltree) / size < limit
Ejemplo n.º 15
0
def test_hdbscan_callable_metric():
    """A callable metric (scipy function, not a string name) is accepted."""
    metric = distance.euclidean

    labels, _prob, _pers, _cond, _link, _mst = hdbscan(X, metric=metric)
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)
Ejemplo n.º 16
0
def test_hdbscan_callable_metric():
    """HDBSCAN accepts a callable metric (function reference, not string key)."""
    metric = distance.euclidean

    # Bug fix: hdbscan() returns a tuple (labels, probabilities, persistence,
    # condensed tree, linkage tree, min spanning tree). The original code did
    # not unpack it, so `labels` was the whole tuple and the cluster count was
    # computed over tuple elements instead of point labels.
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Ejemplo n.º 17
0
def test_hdbscan_callable_metric():
    """Passing a metric function object works for both APIs."""
    euclid = distance.euclidean

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(X, metric=euclid)
    found = set(labels)
    assert len(found) - (1 if -1 in found else 0) == n_clusters

    labels = HDBSCAN(metric=euclid).fit(X).labels_
    found = set(labels)
    assert len(found) - (1 if -1 in found else 0) == n_clusters
Ejemplo n.º 18
0
def test_hdbscan_feature_vector():
    """Both APIs find n_clusters, and the clustering scores decent validity."""
    labels, _prob, _pers, _cond, _link, _mst = hdbscan(X)
    assert len(set(labels) - {-1}) == n_clusters

    labels = HDBSCAN().fit(X).labels_
    assert len(set(labels) - {-1}) == n_clusters

    # The density-based validity index should confirm a reasonable clustering.
    validity = validity_index(X, labels)
    assert validity >= 0.4
Ejemplo n.º 19
0
def test_hdbscan_high_dimensional():
    """64-dimensional blob data still clusters to the expected count."""
    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    H = StandardScaler().fit_transform(H)

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(H)
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(algorithm='best', metric='seuclidean',
                     V=np.ones(H.shape[1])).fit(H).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)
Ejemplo n.º 20
0
def test_hdbscan_high_dimensional():
    """High-dimensional (64 features) input works with the 'best' algorithm."""
    def clusters_found(labels):
        # Distinct labels, noise (-1) excluded.
        return len(set(labels)) - int(-1 in labels)

    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    H = StandardScaler().fit_transform(H)

    functional_labels, _a, _b, _c, _d, _e = hdbscan(H)
    assert_equal(clusters_found(functional_labels), n_clusters)

    estimator_labels = HDBSCAN(algorithm='best', metric='seuclidean',
                               V=np.ones(H.shape[1])).fit(H).labels_
    assert_equal(clusters_found(estimator_labels), n_clusters)
Ejemplo n.º 21
0
def test_hdbscan_distance_matrix():
    """A normalized dense precomputed distance matrix is accepted."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)  # scale distances into [0, 1]

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(D, metric='precomputed')
    # Number of clusters, with the noise label excluded.
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)
Ejemplo n.º 22
0
def test_hdbscan_distance_matrix():
    """metric='precomputed' on a dense pairwise matrix finds n_clusters."""
    pairwise = distance.squareform(distance.pdist(X))
    pairwise /= np.max(pairwise)

    labels, _prob, _pers, _cond, _link, _mst = hdbscan(
        pairwise, metric='precomputed')
    seen = set(labels)
    # Ignore the noise label when counting clusters.
    assert_equal(len(seen) - (1 if -1 in seen else 0), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(pairwise).labels_
    seen = set(labels)
    assert_equal(len(seen) - (1 if -1 in seen else 0), n_clusters)
Ejemplo n.º 23
0
def test_hdbscan_min_cluster_size():
    """No returned cluster may be smaller than the requested minimum size."""
    for size in range(2, len(X) + 1):
        labels, _p, _pers, _ct, _lt, _mt = hdbscan(X, min_cluster_size=size)
        kept = [lab for lab in labels if lab != -1]
        if kept:
            assert np.min(np.bincount(kept)) >= size

        labels = HDBSCAN(min_cluster_size=size).fit(X).labels_
        kept = [lab for lab in labels if lab != -1]
        if kept:
            assert np.min(np.bincount(kept)) >= size
Ejemplo n.º 24
0
def test_hdbscan_min_cluster_size():
    """Every cluster's population respects min_cluster_size for all settings."""
    def smallest_cluster(labels):
        # Size of the smallest non-noise cluster, or None if all noise.
        members = [label for label in labels if label != -1]
        return np.min(np.bincount(members)) if members else None

    for min_cluster_size in range(2, len(X)+1):
        labels, _a, _b, _c, _d, _e = hdbscan(
            X, min_cluster_size=min_cluster_size)
        smallest = smallest_cluster(labels)
        if smallest is not None:
            assert_greater_equal(smallest, min_cluster_size)

        labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_
        smallest = smallest_cluster(labels)
        if smallest is not None:
            assert_greater_equal(smallest, min_cluster_size)
Ejemplo n.º 25
0
def test_hdbscan_distance_matrix():
    """Precomputed distances cluster correctly and score validity >= 0.6."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(D, metric="precomputed")
    # Cluster count with the noise label excluded.
    assert len(set(labels) - {-1}) == n_clusters

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    assert len(set(labels) - {-1}) == n_clusters

    # Density-based validity on the same precomputed matrix.
    validity = validity_index(D, labels, metric="precomputed", d=2)
    assert validity >= 0.6
Ejemplo n.º 26
0
def test_hdbscan_boruvka_balltree():
    """boruvka_balltree finds n_clusters and rejects the 'cosine' metric."""
    labels, _p, _pers, _ct, _lt, _mt = hdbscan(
        X, algorithm='boruvka_balltree')
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(algorithm='boruvka_balltree',
                     gen_min_span_tree=True).fit(X).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)

    # Ball trees need a true metric, so cosine must be rejected.
    assert_raises(ValueError, hdbscan, X,
                  algorithm='boruvka_balltree', metric='cosine')
Ejemplo n.º 27
0
def test_hdbscan_boruvka_balltree():
    """Ball-tree Boruvka clustering works; cosine metric raises ValueError."""
    def count(labels):
        # Distinct labels excluding the noise marker.
        return len(set(labels)) - int(-1 in labels)

    labels, _a, _b, _c, _d, _e = hdbscan(X, algorithm="boruvka_balltree")
    assert count(labels) == n_clusters

    labels = HDBSCAN(algorithm="boruvka_balltree",
                     gen_min_span_tree=True).fit(X).labels_
    assert count(labels) == n_clusters

    assert_raises(ValueError, hdbscan, X,
                  algorithm="boruvka_balltree", metric="cosine")
Ejemplo n.º 28
0
def test_hdbscan_prims_kdtree():
    """Prim's/kd-tree algorithm clusters correctly; 'russelrao' is rejected."""
    labels, _p, _pers, _ct, _lt, _mt = hdbscan(X, algorithm="prims_kdtree")
    assert len(set(labels) - {-1}) == n_clusters

    labels = HDBSCAN(algorithm="prims_kdtree",
                     gen_min_span_tree=True).fit(X).labels_
    assert len(set(labels) - {-1}) == n_clusters

    # kd-trees support only a limited set of metrics; russelrao must fail.
    assert_raises(ValueError, hdbscan, X,
                  algorithm="prims_kdtree", metric="russelrao")
Ejemplo n.º 29
0
def test_hdbscan_boruvka_balltree():
    """The boruvka_balltree backend matches the expected cluster count."""
    labels, _prob, _pers, _cond, _link, _mst = hdbscan(
        X, algorithm='boruvka_balltree')
    unique = set(labels)
    assert_equal(len(unique) - (1 if -1 in unique else 0), n_clusters)

    labels = HDBSCAN(algorithm='boruvka_balltree',
                     gen_min_span_tree=True).fit(X).labels_
    unique = set(labels)
    assert_equal(len(unique) - (1 if -1 in unique else 0), n_clusters)

    # cosine is not a valid ball-tree metric and must raise.
    assert_raises(ValueError, hdbscan, X,
                  algorithm='boruvka_balltree', metric='cosine')
Ejemplo n.º 30
0
def test_hdbscan_sparse_distance_matrix():
    """A sparsified (50%-thresholded) precomputed matrix still clusters."""
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    # Drop the larger half of the distances and store the rest sparsely.
    cutoff = stats.scoreatpercentile(D.flatten(), 50)
    D[D >= cutoff] = 0.0
    D = sparse.csr_matrix(D)
    D.eliminate_zeros()

    labels, _p, _pers, _ct, _lt, _mt = hdbscan(D, metric='precomputed')
    # Cluster count with noise ignored.
    assert_equal(len(set(labels) - {-1}), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    assert_equal(len(set(labels) - {-1}), n_clusters)
Ejemplo n.º 31
0
def test_hdbscan_sparse_distance_matrix():
    """Sparse CSR precomputed distances produce the expected clustering."""
    def cluster_total(labels):
        # Count distinct labels, ignoring the noise label.
        distinct = set(labels)
        return len(distinct) - (1 if -1 in distinct else 0)

    dense = distance.squareform(distance.pdist(X))
    dense /= np.max(dense)

    # Zero out the top half of distances, then keep only nonzeros.
    threshold = stats.scoreatpercentile(dense.flatten(), 50)
    dense[dense >= threshold] = 0.0
    sparse_D = sparse.csr_matrix(dense)
    sparse_D.eliminate_zeros()

    labels, _a, _b, _c, _d, _e = hdbscan(sparse_D, metric='precomputed')
    assert_equal(cluster_total(labels), n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(sparse_D).labels_
    assert_equal(cluster_total(labels), n_clusters)
Ejemplo n.º 32
0
# All the different n-grams in the texts:
# terms = tfidf_vectorizer.get_feature_names()



# clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
# result = clusterer.fit_predict(tfidf_matrix)


# HDBSCAN's functional API wants a dense matrix, so convert the sparse
# TF-IDF output before clustering.
X = tfidf_matrix.todense()
t4 = time.time()

# Bug fix: corrected the typo "matkrix" in the timing message.
print("time to convert tf idf sparse matrix to dense matrix: " + str(t4-t3))

# hdbscan.hdbscan returns labels plus probabilities, persistence and the
# three trees (condensed, single-linkage, minimum spanning).
labels, probabilities, cluster_persistence, condensed_tree, single_linkage_tree, min_spanning_tree = hdbscan.hdbscan(X=X, min_cluster_size=4)

t5 = time.time()

print("time to apply HDBSCAN: " + str(t5-t4))
'''
print(labels)
print()
print(probabilities)
print()
print(cluster_persistence)
print()
print(condensed_tree)
print()
print(single_linkage_tree)