def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
def test_spectral_embedding_amg_solver_failure(seed=36):
    # Test spectral embedding with amg solver failure, see issue #13393
    pytest.importorskip('pyamg')

    # The generated graph below is NOT fully connected if n_neighbors=3
    n_samples = 200
    n_clusters = 3
    n_features = 3
    centers = np.eye(n_clusters, n_features)
    S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    se_amg0 = SpectralEmbedding(n_components=3, affinity="nearest_neighbors",
                                eigen_solver="amg", n_neighbors=3,
                                random_state=np.random.RandomState(seed))
    embed_amg0 = se_amg0.fit_transform(S)

    for i in range(10):
        se_amg0.set_params(random_state=np.random.RandomState(seed + 1))
        embed_amg1 = se_amg0.fit_transform(S)

        assert _check_with_col_sign_flipping(embed_amg0, embed_amg1, 0.05)
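# The helper `_check_with_col_sign_flipping` used above is not shown in this
# excerpt. Eigenvector-based embeddings are only defined up to the sign of
# each component, so a plausible minimal sketch (an assumption, not
# necessarily the project's actual helper) compares two embeddings column by
# column, accepting either sign:
def _col_sign_flip_close(A, B, tol=0.0):
    # For each column, accept if A[:, j] matches B[:, j] or -B[:, j]
    # within a mean squared error of tol ** 2.
    for j in range(A.shape[1]):
        same = ((A[:, j] - B[:, j]) ** 2).mean() <= tol ** 2
        flipped = ((A[:, j] + B[:, j]) ** 2).mean() <= tol ** 2
        if not (same or flipped):
            return False
    return True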
def test_bin_seeds():
    # Test the bin seeding technique which can be used in the mean shift
    # algorithm
    # Data is just 6 points in the plane
    X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
                  [2., 1.], [2.1, 1.1], [0., 0.]])

    # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
    test_bins = get_bin_seeds(X, 1, 1)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0

    # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
    # found
    ground_truth = {(1., 1.), (2., 1.)}
    test_bins = get_bin_seeds(X, 1, 2)
    test_result = set(tuple(p) for p in test_bins)
    assert len(ground_truth.symmetric_difference(test_result)) == 0

    # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found,
    # so the seeding bails out and falls back to the whole data set.
    with warnings.catch_warnings(record=True):
        test_bins = get_bin_seeds(X, 0.01, 1)
    assert_array_almost_equal(test_bins, X)

    # tight clusters around [0, 0] and [1, 1], only get two bins
    X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
                      cluster_std=0.1, random_state=0)
    test_bins = get_bin_seeds(X, 1)
    assert_array_equal(test_bins, [[0, 0], [1, 1]])
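# For reference, the bin seeding idea exercised above can be sketched in a
# few lines: snap every point onto a grid of width `bin_size`, count how many
# points land in each grid cell, and keep the cells with at least
# `min_bin_freq` points as seeds. This is a sketch of the technique, not
# necessarily the exact `get_bin_seeds` implementation.
from collections import defaultdict

def bin_seeds_sketch(X, bin_size, min_bin_freq=1):
    counts = defaultdict(int)
    for point in X:
        counts[tuple(np.round(point / bin_size))] += 1  # snap to grid cell
    # a cell's seed is the cell coordinate scaled back to data space
    return np.array([cell for cell, freq in counts.items()
                     if freq >= min_bin_freq]) * bin_size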
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured
    # datasets because the predict method uses fast euclidean distances
    # computation which may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    #     test_predict_equal_labels. This test has the added effect of
    #     testing idempotence of the fitting procedure which appears to
    #     be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)

        assert_array_equal(labels_1, labels_2)
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X)
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
def test_close_extract():
    # Test extract where extraction eps is close to scaled max_eps
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS
    clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
                   eps=0.3, min_samples=10).fit(X)
    # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
    assert max(clust.labels_) == 2
def test_bad_extract():
    # Test an extraction of eps too close to original eps
    msg = "Specify an epsilon smaller than 0.15. Got 0.3."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS: max_eps = 5.0 * 0.03 = 0.15, so the requested
    # eps = 0.3 exceeds it and fitting must raise
    clust = OPTICS(max_eps=5.0 * 0.03, cluster_method='dbscan',
                   eps=0.3, min_samples=10)
    assert_raise_message(ValueError, msg, clust.fit, X)
def test_parallel():
    # running in parallel (n_jobs=2) should give the same results as the
    # single-job run
    centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    X, _ = make_blobs(n_samples=50, n_features=2, centers=centers,
                      cluster_std=0.4, shuffle=True, random_state=11)

    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)

    ms2 = MeanShift()
    ms2.fit(X)

    assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_)
    assert_array_equal(ms1.labels_, ms2.labels_)
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5,
                     random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_)
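# The equivalence asserted above is expected because Elkan's variant only
# prunes distance computations via the triangle inequality; it never changes
# assignments. The pruning rule itself is easy to check numerically
# (illustrative names, not project code): if d(c1, c2) >= 2 * d(x, c1), then
# d(x, c2) >= d(c1, c2) - d(x, c1) >= d(x, c1), so c2 can be skipped.
rng = np.random.RandomState(1)
x, c1, c2 = rng.normal(size=(3, 10))
if np.linalg.norm(c1 - c2) >= 2 * np.linalg.norm(x - c1):
    # x is provably at least as close to c1 as to c2, so the exact
    # distance d(x, c2) never needs to be computed
    assert np.linalg.norm(x - c2) >= np.linalg.norm(x - c1)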
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS (with dbscan extraction) and DBSCAN produce labels
    # that disagree on at most 5% of the samples
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # calculate optics with dbscan extraction at epsilon `eps`
    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                eps=eps).fit(X)

    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(db.labels_, op.labels_)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree
    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
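# To see why the agreement computation above works, consider a toy pair of
# labelings (illustrative values, not part of the original suite): the
# contingency matrix counts label co-occurrences, and summing the per-row
# (or per-column) maxima counts how many samples the two clusterings can be
# matched on, regardless of how the labels are named.
def test_contingency_agreement_toy():
    a = np.array([0, 0, 1, 1, 1])
    b = np.array([1, 1, 0, 0, 1])  # same partition up to renaming, 1 mismatch
    toy = contingency_matrix(a, b)  # [[0, 2], [2, 1]]
    agree = min(np.sum(np.max(toy, axis=0)),
                np.sum(np.max(toy, axis=1)))
    assert agree == 4  # 4 of the 5 samples agree under the best matching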
import numpy as np
import pytest

from scipy import sparse

from mrex.utils.testing import assert_array_equal
from mrex.utils.testing import assert_array_almost_equal
from mrex.utils.testing import assert_raise_message

from mrex.cluster import MeanShift
from mrex.cluster import mean_shift
from mrex.cluster import estimate_bandwidth
from mrex.cluster import get_bin_seeds
from mrex.datasets.samples_generator import make_blobs


n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=300, n_features=2, centers=centers,
                  cluster_std=0.4, shuffle=True, random_state=11)


def test_estimate_bandwidth():
    # Test estimate_bandwidth
    bandwidth = estimate_bandwidth(X, n_samples=200)
    assert 0.9 <= bandwidth <= 1.5


def test_estimate_bandwidth_1sample():
    # Test estimate_bandwidth when n_samples=1 and quantile<1, so that
    # n_neighbors is set to 1.
    bandwidth = estimate_bandwidth(X, n_samples=1, quantile=0.3)
    assert bandwidth == pytest.approx(0., abs=1e-5)
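# For context, `estimate_bandwidth` is essentially a nearest-neighbor
# statistic: with k = quantile * n_samples neighbors, it averages each
# point's distance to its k-th nearest neighbor. The sketch below assumes the
# package mirrors scikit-learn's layout (`mrex.neighbors.NearestNeighbors`)
# and illustrates the idea, not the exact implementation (which also
# supports subsampling via its `n_samples` argument).
from mrex.neighbors import NearestNeighbors


def estimate_bandwidth_sketch(X, quantile=0.3):
    k = max(1, int(X.shape[0] * quantile))
    nbrs = NearestNeighbors(n_neighbors=k).fit(X)
    # kneighbors returns sorted distances, so the last column holds each
    # point's distance to its k-th nearest neighbor (a point counts as its
    # own first neighbor here, hence the 1-sample case gives ~0)
    distances, _ = nbrs.kneighbors(X)
    return distances[:, -1].mean()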
import numpy as np
import scipy.sparse as sp
import pytest

from mrex.cluster.k_means_ import _labels_inertia
from mrex.cluster.k_means_ import _mini_batch_step
from mrex.datasets.samples_generator import make_blobs
from io import StringIO
from mrex.metrics.cluster import homogeneity_score


# non centered, sparse centers to check the algorithms on such data
centers = np.array([
    [0.0, 5.0, 0.0, 0.0, 0.0],
    [1.0, 1.0, 4.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 100
n_clusters, n_features = centers.shape
X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                            cluster_std=1., random_state=42)
X_csr = sp.csr_matrix(X)


@pytest.mark.parametrize("representation, algo",
                         [('dense', 'full'),
                          ('dense', 'elkan'),
                          ('sparse', 'full')])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array,
                    'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
feature space analysis". IEEE Transactions on Pattern Analysis and
Machine Intelligence. 2002. pp. 603-619.

"""
print(__doc__)

import numpy as np

from mrex.cluster import MeanShift, estimate_bandwidth
from mrex.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)

# #############################################################################
# Compute clustering with MeanShift

# The following bandwidth can be automatically detected using
# estimate_bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
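# Report the outcome: with the three blob centers generated above, mean shift
# is expected to recover 3 clusters (a short usage check added for
# illustration; the count can vary slightly since make_blobs is not seeded
# here).
print("number of estimated clusters : %d" % n_clusters_)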
import time

import numpy as np
import matplotlib.pyplot as plt

from mrex.cluster import MiniBatchKMeans, KMeans
from mrex.metrics.pairwise import pairwise_distances_argmin
from mrex.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
from itertools import cycle
from time import time

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors

from mrex.cluster import Birch, MiniBatchKMeans
from mrex.datasets.samples_generator import make_blobs


# Generate centers for the blobs so that they form a 10 x 10 grid.
xx = np.linspace(-22, 22, 10)
yy = np.linspace(-22, 22, 10)
xx, yy = np.meshgrid(xx, yy)
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis],
                       np.ravel(yy)[:, np.newaxis]))

# Generate blobs to do a comparison between MiniBatchKMeans and Birch.
X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0)

# Use all colors that matplotlib provides by default.
colors_ = cycle(colors.cnames.keys())

fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)

# Compute clustering with Birch with and without the final clustering step
# and plot.
birch_models = [Birch(threshold=1.7, n_clusters=None),
                Birch(threshold=1.7, n_clusters=100)]
final_step = ['without global clustering', 'with global clustering']

for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
    t = time()
Reference:
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007

"""
print(__doc__)

from mrex.cluster import AffinityPropagation
from mrex import metrics
from mrex.datasets.samples_generator import make_blobs

# #############################################################################
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,
                            random_state=0)

# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" %