Example #1
def test_hdbscan_no_clusters():
    labels, p, persist, ctree, ltree, mtree = hdbscan(
        X, min_cluster_size=len(X) + 1)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 0)

    labels = HDBSCAN(min_cluster_size=len(X) + 1).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 0)
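A note on the idiom used throughout these tests: hdbscan labels noise points -1, so the cluster count is the number of distinct labels minus one whenever the noise label appears. A minimal standalone illustration (the labels array below is made up for the demonstration):

import numpy as np

labels = np.array([0, 0, 1, 1, -1, 2])  # -1 marks noise points
n_clusters = len(set(labels)) - int(-1 in labels)
assert n_clusters == 3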
Example #2
def test_hdbscan_boruvka_balltree_matches():

    data = generate_noisy_data()

    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='generic')
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm='boruvka_balltree')

    num_mismatches = homogeneity(labels_prims, labels_boruvka)

    assert_less(num_mismatches / float(data.shape[0]), 0.15)

    labels_prims = HDBSCAN(algorithm='generic').fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm='boruvka_balltree').fit_predict(data)

    num_mismatches = homogeneity(labels_prims, labels_boruvka)

    assert_less(num_mismatches / float(data.shape[0]), 0.15)
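generate_noisy_data is a fixture defined elsewhere in the test module; a plausible stand-in (an assumption, not the original helper) mixes Gaussian blobs with uniform background noise:

import numpy as np
from sklearn.datasets import make_blobs

def generate_noisy_data(n_samples=200, noise_points=50, random_state=0):
    # three Gaussian blobs plus uniform background noise
    blobs, _ = make_blobs(n_samples=n_samples, centers=3,
                          cluster_std=0.5, random_state=random_state)
    rng = np.random.RandomState(random_state)
    noise = rng.uniform(blobs.min(axis=0), blobs.max(axis=0),
                        size=(noise_points, blobs.shape[1]))
    return np.vstack([blobs, noise])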
Example #3
def test_hdbscan_best_balltree_metric():
    labels, p, persist, ctree, ltree, mtree = hdbscan(X,
                                                      metric='seuclidean',
                                                      V=np.ones(X.shape[1]))
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric='seuclidean', V=np.ones(X.shape[1])).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #4
def test_condensed_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.condensed_tree_.plot)(
        select_clusters=True,
        label_clusters=True,
        selection_palette=('r', 'g', 'b'),
        cmap='Reds')
    if_matplotlib(clusterer.condensed_tree_.plot)(log_size=True,
                                                  colorbar=False,
                                                  cmap='none')
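if_matplotlib is a test helper not shown in this listing; a hypothetical stand-in would run the wrapped plotting call only when matplotlib can be imported:

import pytest

def if_matplotlib(func):
    # hypothetical stand-in: call func only if matplotlib is importable,
    # otherwise skip the test
    def run(*args, **kwargs):
        try:
            import matplotlib
            matplotlib.use('Agg')  # headless backend, safe for CI
        except ImportError:
            pytest.skip('matplotlib not available')
        return func(*args, **kwargs)
    return run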
Example #5
def test_tree_output_formats():

    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    clusterer.condensed_tree_.to_pandas()
    clusterer.condensed_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_pandas()
    clusterer.single_linkage_tree_.to_networkx()
    clusterer.single_linkage_tree_.to_numpy()
    clusterer.minimum_spanning_tree_.to_pandas()
    clusterer.minimum_spanning_tree_.to_networkx()
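Per the hdbscan documentation, the condensed tree exports one row per edge with parent, child, lambda_val, and child_size columns. A short sketch of working with the pandas export (X is the module-level fixture these tests share):

from hdbscan import HDBSCAN

clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
tree = clusterer.condensed_tree_.to_pandas()
leaves = tree[tree['child_size'] == 1]    # edges that attach single points
splits = tree[tree['child_size'] > 1]     # edges between clusters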
Example #6
def test_min_span_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap='Reds')

    H, y = make_blobs(n_samples=50, random_state=0, n_features=10)
    H = StandardScaler().fit_transform(H)

    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap='Reds',
                                                         vary_line_width=False,
                                                         colorbar=False)

    H, y = make_blobs(n_samples=50, random_state=0, n_features=40)
    H = StandardScaler().fit_transform(H)

    clusterer = HDBSCAN(gen_min_span_tree=True).fit(H)
    if_matplotlib(clusterer.minimum_spanning_tree_.plot)(edge_cmap='Reds',
                                                         vary_line_width=False,
                                                         colorbar=False)
Example #7
def test_hdbscan_approximate_predict_score():
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores, clusterer,
                  np.array([[1, 2, 3]]))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
        # no clusters warning
        assert "Clusterer does not have any defined clusters" in str(
            w[-1].message)
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1
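approximate_predict_scores has a companion, approximate_predict, which returns labels and membership strengths for unseen points; both require prediction_data=True (or a later generate_prediction_data() call) on the fitted model. A minimal sketch, again assuming the shared X fixture:

import numpy as np
from hdbscan import HDBSCAN, approximate_predict

clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(X)
new_points = np.array([[1.5, -1.0], [0.0, 0.0]])
labels, strengths = approximate_predict(clusterer, new_points)
# label -1 means the point matched no cluster; strengths lie in [0, 1]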
Example #8
def test_hdbscan_boruvka_kdtree_matches():

    data = generate_noisy_data()

    labels_prims, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="generic")
    labels_boruvka, p, persist, ctree, ltree, mtree = hdbscan(
        data, algorithm="boruvka_kdtree")

    num_mismatches = homogeneity(labels_prims, labels_boruvka)

    assert (num_mismatches / float(data.shape[0])) < 0.15

    labels_prims = HDBSCAN(algorithm="generic").fit_predict(data)
    labels_boruvka = HDBSCAN(algorithm="boruvka_kdtree").fit_predict(data)

    num_mismatches = homogeneity(labels_prims, labels_boruvka)

    assert (num_mismatches / float(data.shape[0])) < 0.15
Example #9
def hdbscan_clustering(S, X, config):
    '''
    Computes HDBSCAN clustering on the feature matrix X.
    Returns the labels associated with the clustering.
    Note: the similarity matrix S is accepted for interface
    compatibility but is not used by this implementation.
    '''
    from hdbscan import HDBSCAN

    min_size = config.as_int("min_cluster_size")
    clf = HDBSCAN(min_cluster_size=min_size)
    return clf.fit_predict(X)
Example #10
def get_hotspots(**kwargs):
    '''Return the stable clusters from the condensed tree of connected components from the density graph'''
    print(' * HDBSCAN clustering data with ' +
          str(multiprocessing.cpu_count()) + ' cores...')
    config = {
        'core_dist_n_jobs': multiprocessing.cpu_count(),
        'min_cluster_size': kwargs['min_cluster_size'],
        'cluster_selection_epsilon': 0.01,
        'min_samples': 1,
        'approx_min_span_tree': False,
    }
    v = kwargs['vecs']
    z = HDBSCAN(**config).fit(v)
    # find the points in each cluster
    d = defaultdict(list)
    for idx, i in enumerate(z.labels_):
        d[i].append(v[idx])
    # find the convex hull for each cluster's points
    convex_hulls = []
    for i in d:
        hull = ConvexHull(d[i])
        points = [hull.points[j] for j in hull.vertices]
        # the last convex hull simplex needs to connect back to the first point
        convex_hulls.append(np.vstack([points, points[0]]))
    # find the centroids for each cluster
    centroids = []
    for i in d:
        x, y = np.array(d[i]).T
        centroids.append(np.array([np.mean(x), np.mean(y)]))
    # identify the number of points in each cluster
    lens = [len(d[i]) for i in d]
    # combine data into cluster objects
    closest, _ = pairwise_distances_argmin_min(centroids, v)
    paths = [kwargs['image_paths'][i] for i in closest]
    clusters = [{
        'img': clean_filename(paths[idx]),
        'convex_hull': convex_hulls[idx].tolist(),
        'n_images': lens[idx],
    } for idx, i in enumerate(closest)]
    # remove massive clusters
    retained = []
    for idx, i in enumerate(clusters):
        x, y = np.array(i['convex_hull']).T
        area = 0.5 * np.abs(
            np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
        if area < 0.2:
            retained.append(i)
    # sort the clusters by size
    clusters = sorted(retained, key=lambda i: i['n_images'], reverse=True)
    for idx, i in enumerate(clusters):
        i['label'] = 'Cluster {}'.format(idx + 1)
    # save the hotspots to disk and return the path to the saved json
    print(' * found', len(clusters), 'hotspots')
    return write_json(get_path('hotspots', 'hotspot', **kwargs), clusters,
                      **kwargs)
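The area filter above computes each hull's polygon area with the shoelace formula, 0.5 * |dot(x, roll(y, 1)) - dot(y, roll(x, 1))|, over the ordered hull vertices. A quick standalone sanity check on a unit square:

import numpy as np

def shoelace_area(x, y):
    # polygon area from ordered vertex coordinates
    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))

x = np.array([0.0, 1.0, 1.0, 0.0])
y = np.array([0.0, 0.0, 1.0, 1.0])
assert shoelace_area(x, y) == 1.0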
Example #11
 def hdbscan(self, args):
     start = time.time()
     model = HDBSCAN(
         min_cluster_size=args["min_cluster_size"],
         metric=args["metric"],
         leaf_size=args["leaf_size"],
         allow_single_cluster=args["allow_single_cluster"],
     ).fit(self.data_matrix)
     # hdbscan.HDBSCAN exposes no predict() method; labels for the
     # fitted data are available on the labels_ attribute
     labels = model.labels_
     end = time.time()
     return labels, (end - start)
Example #12
def test_hdbscan_membership_vector():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.05705305, 0.05974177, 0.12228153]]))
    vector = membership_vector(clusterer, np.array([[1.5, -1.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.09462176, 0.32061556, 0.10112905]]))
    vector = membership_vector(clusterer, np.array([[0.0, 0.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.03545607, 0.03363318, 0.04643177]]))
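The expected vectors above are tied to this test's X fixture; in general membership_vector returns one row per query point with a soft membership score per cluster. For scoring the training points themselves, the library also exposes all_points_membership_vectors. A brief sketch:

from hdbscan import HDBSCAN, all_points_membership_vectors

clusterer = HDBSCAN(prediction_data=True).fit(X)
soft = all_points_membership_vectors(clusterer)
# one row per training point, one column per cluster
assert soft.shape[0] == X.shape[0]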
Example #13
def hdbscan_cluster(params):
    print("LOAD CORPUS START")
    content = read_file(params.train_file)

    train_feature = pickle.load(open(params.feature_file, 'rb'))
    indices = list(range(len(train_feature)))

    import random
    SEED = 42
    random.seed(SEED)
    random.shuffle(indices)
    if params.sample_number >= len(train_feature):
        print("sample_number:" + str(len(train_feature)))
    indices = indices[:params.sample_number]
    content = np.array(content)[indices].tolist()
    train_feature = np.array(train_feature)[indices].tolist()

    from hdbscan import HDBSCAN
    print("HDBSCAN STARTING....")
    hdb = HDBSCAN(min_samples=1).fit(train_feature)
    data_dir, _ = os.path.split(params.train_file)
    pickle.dump(
        hdb,
        open(
            os.path.join(data_dir,
                         params.method + str(params.sample_number) + '.obj'),
            'wb'))

    sample_labels = hdb.labels_
    n_clusters_hdb_ = len(
        set(sample_labels)) - (1 if -1 in sample_labels else 0)
    print('\n\n++ HDBSCAN Results')
    print('Estimated number of clusters: %d' % n_clusters_hdb_)
    with open(params.train_file + params.method + str(params.sample_number),
              'w') as file_w:
        for idx, line in enumerate(content):
            file_w.write(str(sample_labels[idx]) + '\t' + line + '\n')

    # May need dimensionality reduction; visualize the clusters
    #import matplotlib.pyplot as plt
    #hdb_unique_labels = set(hdb_labels)
    #hdb_colors = plt.cm.Spectral(np.linspace(0, 1, len(hdb_unique_labels)))
    #fig = plt.figure(figsize=plt.figaspect(0.5))
    #hdb_axis = fig.add_subplot('111')

    #for k, col in zip(hdb_unique_labels, hdb_colors):
    #    if k == -1:
    #        # Black used for noise.
    #        col = 'k'
    #        hdb_axis.plot(X[hdb_labels == k, 0], X[hdb_labels == k, 1], 'o', markerfacecolor=col,
    #                markeredgecolor='k', markersize=6)

    #hdb_axis.set_title('HDBSCAN\nEstimated number of clusters: %d' % n_clusters_hdb_)
    pass
Example #14
def test_hdbscan_centroids_medoids():
    centers = [(0.0, 0.0), (3.0, 3.0)]
    H, y = make_blobs(n_samples=1000, random_state=0, centers=centers, cluster_std=0.5)
    clusterer = HDBSCAN().fit(H)

    for idx, center in enumerate(centers):
        centroid = clusterer.weighted_cluster_centroid(idx)
        assert_array_almost_equal(centroid, center, decimal=1)

        medoid = clusterer.weighted_cluster_medoid(idx)
        assert_array_almost_equal(medoid, center, decimal=1)
Example #15
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #16
def test_condensed_tree_plot():
    clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
    if_matplotlib(clusterer.condensed_tree_.plot)(
        select_clusters=True,
        label_clusters=True,
        selection_palette=("r", "g", "b"),
        cmap="Reds",
    )
    if_matplotlib(clusterer.condensed_tree_.plot)(log_size=True,
                                                  colorbar=False,
                                                  cmap="none")
Example #17
 def fit(self, data, min_cluster_size, min_samples, alpha,
         cluster_selection_method):
     data = np.array(data)
     data = preprocessing.MinMaxScaler().fit_transform(data)
     model = HDBSCAN(min_cluster_size=min_cluster_size,
                     min_samples=min_samples,
                     alpha=alpha,
                     cluster_selection_method=cluster_selection_method,
                     allow_single_cluster=True)
     clustering = model.fit(data)
     return clustering
Example #18
def test_hdbscan_callable_metric():
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    labels, p, persist, ctree, ltree, mtree = hdbscan(X, metric=metric)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    labels = HDBSCAN(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters
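Any callable that takes two 1-D vectors and returns a scalar distance can serve as the metric, though hdbscan then typically falls back to the slower brute-force pairwise path. A hedged sketch with a hand-rolled Manhattan distance (X as above):

import numpy as np
from hdbscan import HDBSCAN

def manhattan(a, b):
    # callable metric: two 1-D vectors in, one scalar distance out
    return float(np.sum(np.abs(a - b)))

labels = HDBSCAN(metric=manhattan).fit(X).labels_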
Example #19
def test_hdbscan_feature_vector():
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_1 == n_clusters

    labels = HDBSCAN().fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert n_clusters_2 == n_clusters

    validity = validity_index(X, labels)
    assert validity >= 0.4
Example #20
def test_hdbscan_feature_vector():
    labels, p, persist, ctree, ltree, mtree = hdbscan(X)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN().fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    validity = validity_index(X, labels)
    assert_greater_equal(validity, 0.4)
Example #21
def run_hdbscan(X, COLS, MCS, MS, DF, y):
    """
    Run hdbscan for given data and combination of Min Cluster Size, Min Samples

    Parameters
    ----------
    X: pandas.DataFrame
        The input data to be clustered

    COLS: list
        The columns from X to be considered

    MCS: int
        Min Cluster Size, input to HDBSCAN()

    MS: int
        Min Samples, input to HDBSCAN()

    DF: pandas.DataFrame
        Data for profiling

    y: pandas.Series
        Target variable; joined to DF and profiled as the 'y' column

    Returns
    -------
    Dict with the profile and labels
    """
    hdb = HDBSCAN(min_cluster_size=MCS, min_samples=MS).fit(X[COLS])

    df = (DF
          .join(y)
          .assign(clus=hdb.labels_)
          .query("clus != -1"))

    profile = (df
               .groupby('clus')
               .mean()
               .join(Series(hdb.labels_, name='size').value_counts(normalize=True))
               .T
               .round(2)
               .loc[COLS + ['size', 'y'], :])

    return {
        'profile': profile.loc[['size', 'y']].T.sort_values('y').query("y > 0.2"),
        'labels': hdb.labels_
    }
Example #22
def test_hdbscan_min_cluster_size():
    for min_cluster_size in range(2, len(X) + 1, 1):
        labels, p, persist, ctree, ltree, mtree = hdbscan(
            X, min_cluster_size=min_cluster_size)
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size

        labels = HDBSCAN(min_cluster_size=min_cluster_size).fit(X).labels_
        true_labels = [label for label in labels if label != -1]
        if len(true_labels) != 0:
            assert np.min(np.bincount(true_labels)) >= min_cluster_size
Example #23
def cluster(df, min_size=4, allow_single_cluster=True):
    """Use HDBSCAN --
    (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
    to find the best clusters for the meander.
    """
    clusterer = HDBSCAN(min_cluster_size=min_size,
                        min_samples=3,
                        metric='haversine',
                        allow_single_cluster=allow_single_cluster)
    clusterer.fit(df[['lat', 'lng']])
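    # note: noise points (label -1) map to 'N' via negative indexing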
    df.loc[:, 'label'] = ['ABCDEFGHIJKLMN'[i] for i in clusterer.labels_]
    return df.sort_values('label').reset_index(drop=True)
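One caveat worth flagging for the example above: scikit-learn's haversine metric, which hdbscan delegates to, expects (latitude, longitude) pairs in radians, so degree-valued columns should be converted first. A sketch under that assumption (df as in the example):

import numpy as np
from hdbscan import HDBSCAN

coords = np.radians(df[['lat', 'lng']].to_numpy())  # degrees -> radians
clusterer = HDBSCAN(min_cluster_size=4, min_samples=3,
                    metric='haversine').fit(coords)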
Example #24
    def perform_hdbscan(self, min_cluster_size=15):
        hdbscan_clusterer = HDBSCAN(min_cluster_size, metric="precomputed")
        hdbscan_clusterer.fit(self.distance_matrix)
        self.hdbscan_results = {
            "parameters": hdbscan_clusterer.get_params(),
            "labels": hdbscan_clusterer.labels_,
            "probabilities": hdbscan_clusterer.probabilities_,
            "n_clusters": np.unique(hdbscan_clusterer.labels_).max() + 1,
            'clusters': label_cnt_dict(hdbscan_clusterer.labels_)
        }

        print_dict(self.hdbscan_results)
Example #25
    def clusterMessageTypesHDBSCAN(self, min_cluster_size = 10, min_samples = 2) \
            -> Tuple[Dict[int, List[Tuple[MessageSegment]]], numpy.ndarray, HDBSCAN]:
        clusterer = HDBSCAN(metric='precomputed',
                            allow_single_cluster=True,
                            cluster_selection_method='leaf',
                            min_cluster_size=min_cluster_size,
                            min_samples=min_samples)

        print("Messages: HDBSCAN min cluster size:", min_cluster_size,
              "min samples:", min_samples)
        segmentClusters, labels = self._postprocessClustering(clusterer)
        return segmentClusters, labels, clusterer
Example #26
def test_hdbscan_distance_matrix():
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, p, persist, ctree, ltree, mtree = hdbscan(D, metric='precomputed')
    # number of clusters, ignoring the noise label if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(metric="precomputed").fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
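A precomputed matrix only needs to be square and symmetric with zeros on the diagonal, so sklearn's pairwise_distances is a convenient way to build one for metrics hdbscan does not handle natively. A sketch:

from hdbscan import HDBSCAN
from sklearn.metrics import pairwise_distances

D = pairwise_distances(X, metric='cosine')  # any valid distance matrix works
labels = HDBSCAN(metric='precomputed').fit(D).labels_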
Example #27
def test_hdbscan_high_dimensional():
    H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
    # H, y = shuffle(X, y, random_state=7)
    H = StandardScaler().fit_transform(H)
    labels, p, persist, ctree, ltree, mtree = hdbscan(H)
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = HDBSCAN(algorithm='best', metric='seuclidean',
                     V=np.ones(H.shape[1])).fit(H).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #28
 def _run_hdbscan(affinity: np.ndarray, min_cluster_size_for_hdbscan: int, min_cluster_size: int, max_cluster_size: int):
     assert affinity.shape[0] == affinity.shape[1]
     # permit a single-cluster solution only when the input is small enough
     allow_single_cluster = affinity.shape[0] <= max_cluster_size
     db = HDBSCAN(metric='precomputed',
                  min_cluster_size=min_cluster_size_for_hdbscan,
                  min_samples=1,
                  allow_single_cluster=allow_single_cluster)
     db.fit(affinity)
     return db
Example #29
 def hdbscan(self, min_cluster_size=10, prediction_data=False):
     """ Like DBSCAN, but allows clusters of varying density and drops
     the epsilon parameter, which is difficult to tune.
     http://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
     Scales slightly worse than DBSCAN, but with a more intuitive parameter.
     """
     model = HDBSCAN(min_cluster_size=min_cluster_size,
                     prediction_data=prediction_data)
     if prediction_data:
         return model.fit(self._safe_dense(self.matrix))
     else:
         return model.fit(self.matrix)
Example #30
def test_hdbscan_allow_single_cluster_with_epsilon():
    np.random.seed(0)
    no_structure = np.random.rand(150, 2)
    # without epsilon we should see many noise points as children of root.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.0,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 46

    # for this random seed an epsilon of 0.2 will produce exactly 2 noise
    # points at that cut in single linkage.
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=0.2,
                     cluster_selection_method='eom',
                     allow_single_cluster=True).fit_predict(no_structure)
    unique_labels, counts = np.unique(labels, return_counts=True)
    assert len(unique_labels) == 2
    assert counts[unique_labels == -1] == 2
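The exact noise counts above are pinned to seed 0. More generally, cluster_selection_epsilon acts as a distance floor below which the hierarchy is not split further, so borderline points stop being labelled noise as it grows. A small sweep illustrating the trend (counts will vary with the seed):

import numpy as np
from hdbscan import HDBSCAN

np.random.seed(0)
no_structure = np.random.rand(150, 2)
for eps in (0.0, 0.1, 0.2):
    labels = HDBSCAN(min_cluster_size=5,
                     cluster_selection_epsilon=eps,
                     allow_single_cluster=True).fit_predict(no_structure)
    print(eps, int(np.sum(labels == -1)))  # noise count shrinks as eps grows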