Ejemplo n.º 1
0
def test_correct_number_of_clusters():
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
    clust.fit(X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert n_clusters_1 == n_clusters

    # check attribute types and sizes
    assert clust.labels_.shape == (len(X),)
    assert clust.labels_.dtype.kind == 'i'

    assert clust.reachability_.shape == (len(X),)
    assert clust.reachability_.dtype.kind == 'f'

    assert clust.core_distances_.shape == (len(X),)
    assert clust.core_distances_.dtype.kind == 'f'

    assert clust.ordering_.shape == (len(X),)
    assert clust.ordering_.dtype.kind == 'i'
    assert set(clust.ordering_) == set(range(len(X)))
Ejemplo n.º 2
0
def test_correct_number_of_clusters():
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4)
    clust.fit(X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert_equal(n_clusters_1, n_clusters)

    # check attribute types and sizes
    assert clust.core_sample_indices_.ndim == 1
    assert clust.core_sample_indices_.size > 0
    assert clust.core_sample_indices_.dtype.kind == 'i'

    assert clust.labels_.shape == (len(X),)
    assert clust.labels_.dtype.kind == 'i'

    assert clust.reachability_.shape == (len(X),)
    assert clust.reachability_.dtype.kind == 'f'

    assert clust.core_distances_.shape == (len(X),)
    assert clust.core_distances_.dtype.kind == 'f'

    assert clust.ordering_.shape == (len(X),)
    assert clust.ordering_.dtype.kind == 'i'
    assert set(clust.ordering_) == set(range(len(X)))
Ejemplo n.º 3
0
def test_correct_number_of_clusters():
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean')
    clust.fit(X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert_equal(n_clusters_1, n_clusters)
Ejemplo n.º 4
0
def test_correct_number_of_clusters():
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean')
    clust.fit(X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert_equal(n_clusters_1, n_clusters)
Ejemplo n.º 5
0
def test_optics():
    # Tests the optics clustering method and all functions inside it
    # 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    print(np.shape(X))
    # Parameters chosen specifically for this task.
    # Compute OPTICS
    clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean')
    clust.fit(X)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert_equal(n_clusters_1, n_clusters)
Ejemplo n.º 6
0
def test_dbscan():
    # Test chaining RadiusNeighborsTransformer and DBSCAN
    radius = 0.3
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        RadiusNeighborsTransformer(radius=radius, mode='distance'),
        DBSCAN(metric='precomputed', eps=radius))
    est_compact = DBSCAN(eps=radius)

    labels_chain = est_chain.fit_predict(X)
    labels_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
Ejemplo n.º 7
0
def test_birch_predict():
    # Test the predict method predicts the nearest centroid.
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3,
                                n_samples_per_cluster=10)

    # n_samples * n_samples_per_cluster
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
Ejemplo n.º 8
0
def test_birch_predict():
    # Test the predict method predicts the nearest centroid.
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3,
                                n_samples_per_cluster=10)

    # n_samples * n_samples_per_cluster
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
Ejemplo n.º 9
0
from scipy.spatial import distance
from scipy import sparse

from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.cluster.dbscan_ import DBSCAN
from sklearn.cluster.dbscan_ import dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances


n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)


def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
Ejemplo n.º 10
0
from scipy import sparse
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import if_matplotlib
from hdbscan import RobustSingleLinkage
from hdbscan import robust_single_linkage
from sklearn.cluster.tests.common import generate_clustered_data

from sklearn import datasets

n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)

def test_rsl_distance_matrix():
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, tree = robust_single_linkage(D, 0.25, metric='precomputed')
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise
    #assert_equal(n_clusters_1, n_clusters)

    labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    #assert_equal(n_clusters_2, n_clusters)

def test_rsl_feature_vector():
def clustered_data(n_clusters):
    return generate_clustered_data(n_clusters=n_clusters)
Ejemplo n.º 12
0
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import if_matplotlib
from hdbscan import HDBSCAN
from hdbscan import hdbscan
from sklearn.cluster.tests.common import generate_clustered_data
from scipy.stats import mode

from tempfile import mkdtemp

from sklearn import datasets

n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)


def relabel(labels):
    result = np.zeros(labels.shape[0])
    labels_to_go = set(labels)
    i = 0
    new_l = 0
    while len(labels_to_go) > 0:
        l = labels[i]
        if l in labels_to_go:
            result[labels == l] = new_l
            new_l += 1
            labels_to_go.remove(l)
        i += 1
    return result