def test_hdbscan_no_clusters():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)
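# The tests in this file assume module-level fixtures such as X and
# n_clusters. A minimal sketch of such a fixture (an assumption, not the
# library's exact test harness):
#
# import numpy as np
# from sklearn.datasets import make_blobs
# from sklearn.preprocessing import StandardScaler
# from hdbscan import HDBSCAN, hdbscan
#
# n_clusters = 3
# X, y = make_blobs(n_samples=200, n_features=2, centers=n_clusters,
#                   random_state=10)
# X = StandardScaler().fit_transform(X)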
def test_hdbscan_boruvka_balltree_matches():
    data = generate_noisy_data()
    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='generic')
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='boruvka_balltree')
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert_less(num_mismatches / float(data.shape[0]), 0.15)

    labels_prims = HDBSCAN(algorithm='generic').fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert_less(num_mismatches / float(data.shape[0]), 0.15)
def test_hdbscan_best_balltree_metric():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, metric='seuclidean', V=np.ones(X.shape[1]))
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric='seuclidean',
                     V=np.ones(X.shape[1])).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_condensed_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.condensed_tree_.plot)(
        select_clusters=True,
        label_clusters=True,
        selection_palette=('r', 'g', 'b'),
        cmap='Reds')
    if_matplotlib(clusterer.condensed_tree_.plot)(
        log_size=True, colorbar=False, cmap='none')
def test_tree_output_formats():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    clusterer.condensed_tree_.to_pandas()
    clusterer.condensed_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_pandas()
    clusterer.single_linkage_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_numpy()
    clusterer.minimum_spanning_tree_.to_pandas()
    clusterer.minimum_spanning_tree_.to_networkx()
def test_min_span_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap='Reds')

    H, y = make_blobs(n_samples=50, random_state=0, n_features=10)
    H = StandardScaler().fit_transform(H)
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(
        edge_cmap='Reds', vary_line_width=False, colorbar=False)

    H, y = make_blobs(n_samples=50, random_state=0, n_features=40)
    H = StandardScaler().fit_transform(H)
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(
        edge_cmap='Reds', vary_line_width=False, colorbar=False)
def test_hdbscan_approximate_predict_score():
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores,
                  clusterer, np.array([[1, 2, 3]]))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
        # no clusters warning
        assert "Clusterer does not have any defined clusters" in str(
            w[-1].message)
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1
def test_hdbscan_boruvka_kdtree_matches():
    data = generate_noisy_data()
    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="generic")
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="boruvka_kdtree")
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert (num_mismatches / float(data.shape[0])) < 0.15

    labels_prims = HDBSCAN(algorithm="generic").fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm="boruvka_kdtree").fit_predict(data)
    num_mismatches = homogeneity(labels_prims, labels_boruvka)
    assert (num_mismatches / float(data.shape[0])) < 0.15
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering for the input data. Note that the
    similarity matrix S is unused; clustering runs on the feature matrix X.
    Returns the labels associated with the clustering.
    '''
    from hdbscan import HDBSCAN
    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
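# Usage sketch (hypothetical): `config` only needs an `as_int` accessor, as
# provided e.g. by configobj-style section objects. A minimal stand-in:
class _FakeConfig:
    def __init__(self, values):
        self._values = values

    def as_int(self, key):
        # mirror the only config method hdbscan_clustering relies on
        return int(self._values[key])

# labels = hdbscan_clustering(None, X, _FakeConfig({"min_cluster_size": 15}))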
def get_hotspots(**kwargs):
    '''Return the stable clusters from the condensed tree of connected
    components from the density graph'''
    print(' * HDBSCAN clustering data with ' +
          str(multiprocessing.cpu_count()) + ' cores...')
    config = {
        'core_dist_n_jobs': multiprocessing.cpu_count(),
        'min_cluster_size': kwargs['min_cluster_size'],
        'cluster_selection_epsilon': 0.01,
        'min_samples': 1,
        'approx_min_span_tree': False,
    }
    v = kwargs['vecs']
    z = HDBSCAN(**config).fit(v)
    # find the points in each cluster
    d = defaultdict(list)
    for idx, i in enumerate(z.labels_):
        d[i].append(v[idx])
    # find the convex hull for each cluster's points
    convex_hulls = []
    for i in d:
        hull = ConvexHull(d[i])
        points = [hull.points[j] for j in hull.vertices]
        # the last convex hull simplex needs to connect back to the first point
        convex_hulls.append(np.vstack([points, points[0]]))
    # find the centroids for each cluster
    centroids = []
    for i in d:
        x, y = np.array(d[i]).T
        centroids.append(np.array([np.mean(x), np.mean(y)]))
    # identify the number of points in each cluster
    lens = [len(d[i]) for i in d]
    # combine data into cluster objects
    closest, _ = pairwise_distances_argmin_min(centroids, v)
    paths = [kwargs['image_paths'][i] for i in closest]
    clusters = [{
        'img': clean_filename(paths[idx]),
        'convex_hull': convex_hulls[idx].tolist(),
        'n_images': lens[idx],
    } for idx, i in enumerate(closest)]
    # remove massive clusters (polygon area computed via the shoelace formula)
    retained = []
    for idx, i in enumerate(clusters):
        x, y = np.array(i['convex_hull']).T
        area = 0.5 * np.abs(
            np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
        if area < 0.2:
            retained.append(i)
    # sort the clusters by size
    clusters = sorted(retained, key=lambda i: i['n_images'], reverse=True)
    for idx, i in enumerate(clusters):
        i['label'] = 'Cluster {}'.format(idx + 1)
    # save the hotspots to disk and return the path to the saved json
    print(' * found', len(clusters), 'hotspots')
    return write_json(get_path('hotspots', 'hotspot', **kwargs),
                      clusters, **kwargs)
def hdbscan(self, args):
    start = time.time()
    model = HDBSCAN(
        min_cluster_size=args["min_cluster_size"],
        metric=args["metric"],
        leaf_size=args["leaf_size"],
        allow_single_cluster=args["allow_single_cluster"],
    ).fit(self.data_matrix)
    # HDBSCAN has no predict(); labels for the fitted data live on labels_
    labels = model.labels_
    end = time.time()
    return labels, (end - start)
def test_hdbscan_membership_vector():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.05705305, 0.05974177, 0.12228153]]))
    vector = membership_vector(clusterer, np.array([[1.5, -1.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.09462176, 0.32061556, 0.10112905]]))
    vector = membership_vector(clusterer, np.array([[0.0, 0.0]]))
    assert_array_almost_equal(
        vector, np.array([[0.03545607, 0.03363318, 0.04643177]]))
def hdbscan_cluster(params):
    print("LOAD CORPUS START")
    content = read_file(params.train_file)
    train_feature = pickle.load(open(params.feature_file, 'rb'))
    indices = list(range(len(train_feature)))
    import random
    SEED = 42
    random.seed(SEED)
    random.shuffle(indices)
    if params.sample_number >= len(train_feature):
        print("sample_number:" + str(len(train_feature)))
    indices = indices[:params.sample_number]
    content = np.array(content)[indices].tolist()
    train_feature = np.array(train_feature)[indices].tolist()

    from hdbscan import HDBSCAN
    print("HDBSCAN STARTING....")
    hdb = HDBSCAN(min_samples=1).fit(train_feature)
    data_dir, _ = os.path.split(params.train_file)
    pickle.dump(
        hdb,
        open(os.path.join(data_dir,
                          params.method + str(params.sample_number) + '.obj'),
             'wb'))
    sample_labels = hdb.labels_
    n_clusters_hdb_ = len(
        set(sample_labels)) - (1 if -1 in sample_labels else 0)
    print('\n\n++ HDBSCAN Results')
    print('Estimated number of clusters: %d' % n_clusters_hdb_)
    with open(params.train_file + params.method + str(params.sample_number),
              'w') as file_w:
        for idx, line in enumerate(content):
            file_w.write(str(sample_labels[idx]) + '\t' + line + '\n')

    # Dimensionality reduction may be needed to visualize the clustering:
    # import matplotlib.pyplot as plt
    # hdb_unique_labels = set(hdb_labels)
    # hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
    # fig = plt.figure(figsize=plt.figaspect(0.5))
    # hdb_axis = fig.add_subplot('111')
    # for k, col in zip(hdb_unique_labels, hdb_colors):
    #     if k == -1:
    #         # Black used for noise.
    #         col = 'k'
    #     hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o',
    #                   markerfacecolor=col, markeredgecolor='k',
    #                   markersize=6)
    # hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d'
    #                    % n_clusters_hdb_)
def test_hdbscan_centroids_medoids():
    centers = [(0.0, 0.0), (3.0, 3.0)]
    H, y = make_blobs(n_samples=1000, random_state=0,
                      centers=centers, cluster_std=0.5)
    clusterer = HDBSCAN().fit(H)
    for idx, center in enumerate(centers):
        centroid = clusterer.weighted_cluster_centroid(idx)
        assert_array_almost_equal(centroid, center, decimal=1)
        medoid = clusterer.weighted_cluster_medoid(idx)
        assert_array_almost_equal(medoid, center, decimal=1)
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def fit(self, data, min_cluster_size, min_samples, alpha,
        cluster_selection_method):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = HDBSCAN(min_cluster_size=min_cluster_size,
                    min_samples=min_samples,
                    alpha=alpha,
                    cluster_selection_method=cluster_selection_method,
                    allow_single_cluster=True)
    clustering = model.fit(data)
    return clustering
def test_hdbscan_feature_vector():
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    labels = HDBSCAN().fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters

    validity = validity_index(X, labels)
    assert validity >= 0.4
def run_hdbscan(X, COLS, MCS, MS, DF, y):
    """
    Run HDBSCAN for the given data and a combination of
    Min Cluster Size and Min Samples.

    Parameters
    ----------
    X: pandas.DataFrame
        The input data to be clustered
    COLS: list
        The columns from X to be considered
    MCS: int
        Min Cluster Size, input to HDBSCAN()
    MS: int
        Min Samples, input to HDBSCAN()
    DF: pandas.DataFrame
        Data for profiling
    y: pandas.Series
        Series named 'y', joined to DF for profiling

    Returns
    -------
    Dict with the profile and labels
    """
    hdb = (HDBSCAN(min_cluster_size=MCS, min_samples=MS)
           .fit(X[COLS]))
    df = (DF
          .join(y)
          .assign(clus=hdb.labels_)
          .query("clus != -1"))
    profile = (df
               .groupby('clus')
               .mean()
               .join(Series(hdb.labels_, name='size')
                     .value_counts(normalize=True))
               .T
               .round(2)
               .loc[COLS + ['size', 'y'], :])
    return {
        'profile': (profile.loc[['size', 'y']].T
                    .sort_values('y')
                    .query("y > 0.2")),
        'labels': hdb.labels_,
    }
def test_hdbscan_min_cluster_size():
    for min_cluster_size in range(2, len(X) + 1, 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(
            X, min_cluster_size=min_cluster_size)
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size

        labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN -- (Hierarchical Density-Based Spatial Clustering of
    Applications with Noise) to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size, min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
    # noise points (label -1) index the last letter, 'N'
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
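# Caveat (assumption about the haversine metric as implemented in
# scikit-learn's trees, which hdbscan uses): it expects (lat, lng) in
# radians and returns distances on the unit sphere, so degree-valued
# coordinates should be converted first. Hypothetical usage, where `df`
# holds lat/lng in degrees:
#
# import numpy as np
# df[['lat', 'lng']] = np.radians(df[['lat', 'lng']])
# clustered = cluster(df)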
def perform_hdbscan(self, min_cluster_size=15):
    hdbscan_clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                                metric="precomputed")
    hdbscan_clusterer.fit(self.distance_matrix)
    self.hdbscan_results = {
        "parameters": hdbscan_clusterer.get_params(),
        "labels": hdbscan_clusterer.labels_,
        "probabilities": hdbscan_clusterer.probabilities_,
        "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
        "clusters": label_cnt_dict(hdbscan_clusterer.labels_),
    }
    print_dict(self.hdbscan_results)
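# Usage sketch (assumption: `self.distance_matrix` is a square, symmetric
# matrix of pairwise distances). One common way to build such a matrix with
# scikit-learn, here with a hypothetical feature matrix `X` and the cosine
# metric as an example:
#
# from sklearn.metrics import pairwise_distances
# distance_matrix = pairwise_distances(X, metric='cosine')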
def clusterMessageTypesHDBSCAN(self, min_cluster_size=10, min_samples=2) \
        -> Tuple[Dict[int, List[Tuple[MessageSegment]]], numpy.ndarray, HDBSCAN]:
    clusterer = HDBSCAN(metric='precomputed', allow_single_cluster=True,
                        cluster_selection_method='leaf',
                        min_cluster_size=min_cluster_size,
                        min_samples=min_samples)
    print("Messages: HDBSCAN min cluster size:", min_cluster_size,
          "min samples:", min_samples)
    segmentClusters, labels = self._postprocessClustering(clusterer)
    return segmentClusters, labels, clusterer
def test_hdbscan_distance_matrix():
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric='precomputed')
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def test_hdbscan_high_dimensional():
    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    # H, y = shuffle(X, y, random_state=7)
    H = StandardScaler().fit_transform(H)
    labels, p, persist, ctree, ltree, mtree = hdbscan(H)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(algorithm='best', metric='seuclidean',
                     V=np.ones(H.shape[1])).fit(H).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
def _run_hdbscan(affinity: np.ndarray, min_cluster_size_for_hdbscan: int,
                 min_cluster_size: int, max_cluster_size: int):
    assert affinity.shape[0] == affinity.shape[1]
    # only allow everything to collapse into a single cluster when the
    # input is small enough
    allow_single_cluster = affinity.shape[0] <= max_cluster_size
    db = HDBSCAN(metric='precomputed',
                 min_cluster_size=min_cluster_size_for_hdbscan,
                 min_samples=1,
                 allow_single_cluster=allow_single_cluster)
    db.fit(affinity)
    return db
def hdbscan(self, min_cluster_size=10, prediction_data=False):
    """
    DBSCAN but allows for varying density clusters and no longer requires
    the epsilon parameter, which is difficult to tune.

    http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

    Scales slightly worse than DBSCAN, but with a more intuitive parameter.
    """
    hdbscan = HDBSCAN(min_cluster_size=min_cluster_size,
                      prediction_data=prediction_data)
    if prediction_data:
        # prediction data generation needs a dense matrix, hence the
        # conversion here
        return hdbscan.fit(self._safe_dense(self.matrix))
    else:
        return hdbscan.fit(self.matrix)
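# Usage sketch (hypothetical object and data names): with
# prediction_data=True, the fitted model can assign new points via the
# library's approximate_predict without refitting:
#
# from hdbscan.prediction import approximate_predict
# clusterer = pipeline.hdbscan(min_cluster_size=10, prediction_data=True)
# new_labels, strengths = approximate_predict(clusterer, new_points)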
def test_hdbscan_allow_single_cluster_with_epsilon():
    np.random.seed(0)
    no_structure = np.random.rand(150, 2)

    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.0,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 46

    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.2,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2