def test_minibatch_k_means_init(data, init):
    mb_k_means = MiniBatchKMeans(init=init, n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    mb_k_means.fit(data)
    _check_fitted_model(mb_k_means)
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # mini batch kmeans is very unstable on such a small dataset hence
            # we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert km.cluster_centers_.dtype == np.float64

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([v_measure_score(expected_labels, km.labels_)
                           for km in fitted_models])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
def test_predict_minibatch_dense_sparse(init):
    # check that models trained on sparse input also work for dense input at
    # predict time
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init,
                                 n_init=10, random_state=0).fit(X_csr)

    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert v_measure_score(true_labels, labels) == 1.0
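# A minimal extra sketch (not from the original suite) of the same streaming
# pattern against an explicitly generated dataset, assuming mrex mirrors the
# scikit-learn API for make_blobs and MiniBatchKMeans.partial_fit:
def test_partial_fit_streaming_sketch():
    X_stream, _ = make_blobs(n_samples=300, centers=3, random_state=0)
    km = MiniBatchKMeans(n_clusters=3, random_state=0)
    # feed the data in small chunks; each call refines the current centers
    for chunk in np.array_split(X_stream, 20):
        km.partial_fit(chunk)
    # after streaming, the model predicts like any fitted estimator
    assert km.predict(X_stream).shape == (300,)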
def test_mb_kmeans_verbose():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42, verbose=1)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)
    finally:
        sys.stdout = old_stdout
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
def test_minibatch_reassign():
    # Give a perfect initialization, but a large reassignment_ratio,
    # as a result all the centers should be reassigned and the model
    # should no longer be good
    sample_weight = np.ones(X.shape[0], dtype=X.dtype)
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                     random_state=42)
        mb_k_means.fit(this_X)

        score_before = mb_k_means.score(this_X)
        try:
            old_stdout = sys.stdout
            sys.stdout = StringIO()
            # Turn on verbosity to smoke test the display code
            _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1),
                             mb_k_means.cluster_centers_,
                             mb_k_means.counts_,
                             np.zeros(X.shape[1], np.double),
                             False, distances=np.zeros(X.shape[0]),
                             random_reassign=True, random_state=42,
                             reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = old_stdout
        assert score_before > mb_k_means.score(this_X)

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                     init=centers.copy(),
                                     random_state=42, n_init=1)
        mb_k_means.fit(this_X)
        clusters_before = mb_k_means.cluster_centers_
        # Turn on verbosity to smoke test the display code
        _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1),
                         mb_k_means.cluster_centers_,
                         mb_k_means.counts_,
                         np.zeros(X.shape[1], np.double),
                         False, distances=np.zeros(X.shape[0]),
                         random_reassign=True, random_state=42,
                         reassignment_ratio=1e-15)
        assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_)
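# A hedged sketch of the same knob through the public API alone (an
# assumption: mrex exposes scikit-learn's reassignment_ratio constructor
# parameter rather than only the private _mini_batch_step helper). A larger
# ratio makes low-count centers eligible for random reassignment during fit;
# the assertion here is only a shape smoke check, not a behavioral guarantee.
def test_reassignment_ratio_public_api_sketch():
    km_stable = MiniBatchKMeans(n_clusters=n_clusters, random_state=42,
                                reassignment_ratio=0.0).fit(X)
    km_reassign = MiniBatchKMeans(n_clusters=n_clusters, random_state=42,
                                  reassignment_ratio=0.5).fit(X)
    # both runs must at least produce the right number of centers
    assert (km_stable.cluster_centers_.shape ==
            km_reassign.cluster_centers_.shape)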
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    with pytest.raises(ValueError, match=msg):
        MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
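# For context, a hedged sketch of the callable-init contract assumed above
# (scikit-learn style): the callable receives (X, n_clusters, random_state)
# and returns an array of shape (n_clusters, n_features) to use as the
# initial centers. pick_rows below is a hypothetical helper, not part of the
# library, that seeds the centers with k randomly chosen samples.
def test_callable_init_random_rows_sketch():
    def pick_rows(X_arr, k, random_state):
        rows = random_state.permutation(X_arr.shape[0])[:k]
        return X_arr[rows]

    km = MiniBatchKMeans(n_clusters=n_clusters, init=pick_rows, n_init=1,
                         random_state=42).fit(X)
    assert km.cluster_centers_.shape[0] == n_clusters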
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 batch_size=10, random_state=42,
                                 n_init=1).fit(X)
    assert mb_k_means.init_size_ == 3 * mb_k_means.batch_size
    _check_fitted_model(mb_k_means)
def test_minibatch_set_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 init_size=666, random_state=42,
                                 n_init=1).fit(X)
    assert mb_k_means.init_size == 666
    assert mb_k_means.init_size_ == n_samples
    _check_fitted_model(mb_k_means)
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_minibatch_with_many_reassignments():
    # Test for the case that the number of clusters to reassign is bigger
    # than the batch_size
    n_samples = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))
    # Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because it turned out
    # that these values ensure that the number of clusters to reassign
    # is always bigger than the batch_size
    n_clusters = 550
    MiniBatchKMeans(n_clusters=n_clusters,
                    batch_size=100,
                    init_size=n_samples,
                    random_state=42).fit(X)
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
def test_minibatch_kmeans_partial_fit_int_data():
    # Issue GH #14314
    # use the builtin int dtype; np.int is a deprecated alias for it
    X = np.array([[-1], [1]], dtype=int)
    km = MiniBatchKMeans(n_clusters=2)
    km.partial_fit(X)
    assert km.cluster_centers_.dtype.kind == "f"
        km = factory(n_clusters=n_clusters, init=init, random_state=run_id,
                     n_init=n_init, **params).fit(X)
        inertia[i, run_id] = km.inertia_
    p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))
    plots.append(p[0])
    legends.append("%s with %s init" % (factory.__name__, init))

plt.xlabel('n_init')
plt.ylabel('inertia')
plt.legend(plots, legends)
plt.title("Mean inertia for various k-means init across %d runs" % n_runs)

# Part 2: Qualitative visual inspection of the convergence

X, y = make_data(random_state, n_samples_per_center, grid_size, scale)
km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,
                     random_state=random_state).fit(X)

plt.figure()
for k in range(n_clusters):
    my_members = km.labels_ == k
    color = cm.nipy_spectral(float(k) / n_clusters, 1)
    plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)
    cluster_center = km.cluster_centers_[k]
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markerfacecolor=color, markeredgecolor='k', markersize=6)
    plt.title("Example cluster allocation with a single random init\n"
              "with MiniBatchKMeans")
plt.show()
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale',
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
    'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
    'sci.space', 'soc.religion.christian', 'talk.politics.guns',
    'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc',
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                 random_state=42, tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
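# A minimal sketch of that pairing step (the script is truncated here, so this
# is an assumption, not the verified original continuation; it also assumes
# mrex provides scikit-learn's pairwise_distances_argmin helper): for each
# KMeans center, find the index of the nearest MiniBatchKMeans center, then
# reorder the MiniBatchKMeans centers so matching clusters share an index.
from mrex.metrics.pairwise import pairwise_distances_argmin

k_means_cluster_centers = k_means.cluster_centers_
order = pairwise_distances_argmin(k_means.cluster_centers_,
                                  mbk.cluster_centers_)
mbk_means_cluster_centers = mbk.cluster_centers_[order]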
    ('Sparse comp. - MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                      n_iter=100, batch_size=3,
                                      random_state=rng),
     True),

    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                               n_iter=50, batch_size=3,
                                               random_state=rng),
     True),

    ('Cluster centers - MiniBatchKMeans',
     MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                     max_iter=50, random_state=rng),
     True),

    ('Factor Analysis components - FA',
     decomposition.FactorAnalysis(n_components=n_components, max_iter=20),
     True),
]

# #############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

# #############################################################################
# Do the estimation and plot it
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
import time

import matplotlib.pyplot as plt
import numpy as np

from mrex import datasets
from mrex.cluster import MiniBatchKMeans
from mrex.feature_extraction.image import extract_patches_2d

faces = datasets.fetch_olivetti_faces()

# #############################################################################
# Learn the dictionary of images

print('Learning the dictionary... ')
rng = np.random.RandomState(0)
kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)

buffer = []
t0 = time.time()

# The online learning part: cycle over the whole dataset 6 times
index = 0
for _ in range(6):
    for img in faces.images:
        data = extract_patches_2d(img, patch_size, max_patches=50,
                                  random_state=rng)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
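        # The script is cut off above; what follows is a hedged sketch of how
        # such a buffer is typically flushed to partial_fit so the dictionary
        # is learned online. The 10-image flush interval and the per-flush
        # normalization are assumptions, not the verified original code.
        index += 1
        if index % 10 == 0:
            patches = np.concatenate(buffer, axis=0)
            # center and scale each pixel feature before the partial fit
            patches = patches - np.mean(patches, axis=0)
            patches = patches / np.std(patches, axis=0)
            kmeans.partial_fit(patches)
            buffer = []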
    ax = fig.add_subplot(1, 3, ind + 1)
    for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
        mask = labels == k
        ax.scatter(X[mask, 0], X[mask, 1], c='w', edgecolor=col, marker='.',
                   alpha=0.5)
        if birch_model.n_clusters is None:
            ax.scatter(this_centroid[0], this_centroid[1], marker='+',
                       c='k', s=25)
    ax.set_ylim([-25, 25])
    ax.set_xlim([-25, 25])
    ax.set_autoscaley_on(False)
    ax.set_title('Birch %s' % info)

# Compute clustering with MiniBatchKMeans.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0)
t0 = time()
mbk.fit(X)
t_mini_batch = time() - t0
print("Time taken to run MiniBatchKMeans %0.2f seconds" % t_mini_batch)
mbk_means_labels_unique = np.unique(mbk.labels_)

ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
                                 range(n_clusters), colors_):
    mask = mbk.labels_ == k
    ax.scatter(X[mask, 0], X[mask, 1], marker='.', c='w', edgecolor=col,
               alpha=0.5)
    ax.scatter(this_centroid[0], this_centroid[1], marker='+',
               c='k', s=25)
print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() # ############################################################################# # Do the actual clustering if opts.minibatch: km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print()