Example #1
def test_minibatch_k_means_init(data, init):
    mb_k_means = MiniBatchKMeans(init=init,
                                 n_clusters=n_clusters,
                                 random_state=42,
                                 n_init=10)
    mb_k_means.fit(data)
    _check_fitted_model(mb_k_means)
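
Most of the test snippets below refer to module-level fixtures (X, X_csr, centers, n_clusters, n_samples, true_labels) and to the helper _check_fitted_model, none of which appear in these excerpts. A minimal sketch of equivalent setup, assuming scikit-learn's make_blobs; the names match the excerpts but the concrete values are illustrative:

import numpy as np
import scipy.sparse as sp
from sklearn.datasets import make_blobs

# Illustrative stand-ins for the fixtures the excerpts rely on
n_clusters, n_samples = 3, 100
centers = np.array([[0.0, 5.0], [1.0, 1.0], [5.0, 0.0]])
X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                            cluster_std=1.0, random_state=42)
X_csr = sp.csr_matrix(X)

def _check_fitted_model(km):
    # A fitted model should expose centers and labels of the expected shape.
    assert km.cluster_centers_.shape == (n_clusters, X.shape[1])
    assert km.labels_.shape == (n_samples,)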
Example #2
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # MiniBatchKMeans is very unstable on such a small dataset,
            # hence we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2,
                            batch_size=2,
                            init=init_int,
                            n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert km.cluster_centers_.dtype == np.float64

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([
            v_measure_score(expected_labels, km.labels_)
            for km in fitted_models
        ])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
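
The property exercised here is that integer input is converted internally, so the fitted centers always come out as floating point. A standalone illustration of the same behavior, reusing the data from the test above:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X_int = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]],
                 dtype=np.int32)
km = MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2,
                     random_state=0).fit(X_int)
print(km.cluster_centers_.dtype)  # float64, never an integer dtype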
Example #3
def test_predict_minibatch_dense_sparse(init):
    # check that models trained on sparse input also work for dense input
    # at predict time
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 init=init,
                                 n_init=10,
                                 random_state=0).fit(X_csr)

    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
Example #4
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert v_measure_score(true_labels, labels) == 1.0
Example #5
def test_mb_kmeans_verbose():
    mb_k_means = MiniBatchKMeans(init="k-means++",
                                 n_clusters=n_clusters,
                                 random_state=42,
                                 verbose=1)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)
    finally:
        sys.stdout = old_stdout
Example #6
def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples,
                                       centers=5,
                                       cluster_std=1.,
                                       random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
Example #7
def test_minibatch_reassign():
    # Give a perfect initialization, but a large reassignment_ratio,
    # as a result all the centers should be reassigned and the model
    # should no longer be good
    sample_weight = np.ones(X.shape[0], dtype=X.dtype)
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                     batch_size=100,
                                     random_state=42)
        mb_k_means.fit(this_X)

        score_before = mb_k_means.score(this_X)
        try:
            old_stdout = sys.stdout
            sys.stdout = StringIO()
            # Turn on verbosity to smoke test the display code
            _mini_batch_step(this_X,
                             sample_weight, (X**2).sum(axis=1),
                             mb_k_means.cluster_centers_,
                             mb_k_means.counts_,
                             np.zeros(X.shape[1], np.double),
                             False,
                             distances=np.zeros(X.shape[0]),
                             random_reassign=True,
                             random_state=42,
                             reassignment_ratio=1,
                             verbose=True)
        finally:
            sys.stdout = old_stdout
        assert score_before > mb_k_means.score(this_X)

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                     batch_size=100,
                                     init=centers.copy(),
                                     random_state=42,
                                     n_init=1)
        mb_k_means.fit(this_X)
        clusters_before = mb_k_means.cluster_centers_
        # Turn on verbosity to smoke test the display code
        _mini_batch_step(this_X,
                         sample_weight, (X**2).sum(axis=1),
                         mb_k_means.cluster_centers_,
                         mb_k_means.counts_,
                         np.zeros(X.shape[1], np.double),
                         False,
                         distances=np.zeros(X.shape[0]),
                         random_reassign=True,
                         random_state=42,
                         reassignment_ratio=1e-15)
        assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_)
Example #8
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100,
                                       centers=5,
                                       cluster_std=1.,
                                       random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20,
                                 batch_size=10,
                                 random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20,
                                 batch_size=201,
                                 random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert mb_k_means.cluster_centers_.any(axis=1).sum() > 10
Example #9
def test_sparse_mb_k_means_callable_init():
    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    with pytest.raises(ValueError, match=msg):
        MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
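
As the test shows, a callable init is invoked as init(X, k, random_state) and must return an array of shape (k, n_features) whose row count matches n_clusters. A minimal sketch of a working callable init, assuming dense X (the helper name is hypothetical):

import numpy as np

def random_rows_init(X, k, random_state):
    # random_state arrives as a seeded RandomState; pick k distinct samples.
    indices = random_state.permutation(X.shape[0])[:k]
    return X[indices]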
Example #10
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 batch_size=10,
                                 random_state=42,
                                 n_init=1).fit(X)
    assert mb_k_means.init_size_ == 3 * mb_k_means.batch_size
    _check_fitted_model(mb_k_means)
Example #11
def test_minibatch_set_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 init_size=666,
                                 random_state=42,
                                 n_init=1).fit(X)
    assert mb_k_means.init_size == 666
    assert mb_k_means.init_size_ == n_samples
    _check_fitted_model(mb_k_means)
Example #12
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
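
_sort_centers is another helper from the same test module that these excerpts do not show; it only needs to make center arrays comparable independently of cluster ordering. A minimal sketch of such a helper (an assumption, not necessarily the verbatim original):

def _sort_centers(centers):
    # Sort each column so equal sets of centers compare equal
    # regardless of the permutation of cluster labels.
    return np.sort(centers, axis=0)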
Example #13
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #14
def test_minibatch_with_many_reassignments():
    # Test for the case that the number of clusters to reassign is bigger
    # than the batch_size
    n_samples = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))
    # Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because it turned out
    # that these values ensure that the number of clusters to reassign
    # is always bigger than the batch_size
    n_clusters = 550
    MiniBatchKMeans(n_clusters=n_clusters,
                    batch_size=100,
                    init_size=n_samples,
                    random_state=42).fit(X)
Example #15
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, random_state=42)
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
Example #16
def test_minibatch_kmeans_partial_fit_int_data():
    # Issue GH #14314
    X = np.array([[-1], [1]], dtype=int)
    km = MiniBatchKMeans(n_clusters=2)
    km.partial_fit(X)
    assert km.cluster_centers_.dtype.kind == "f"
Example #17
            # Built inside nested loops over n_init values (i) and runs (run_id)
            km = factory(n_clusters=n_clusters, init=init, random_state=run_id,
                         n_init=n_init, **params).fit(X)
            inertia[i, run_id] = km.inertia_
    p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))
    plots.append(p[0])
    legends.append("%s with %s init" % (factory.__name__, init))

plt.xlabel('n_init')
plt.ylabel('inertia')
plt.legend(plots, legends)
plt.title("Mean inertia for various k-means init across %d runs" % n_runs)

# Part 2: Qualitative visual inspection of the convergence

X, y = make_data(random_state, n_samples_per_center, grid_size, scale)
km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1,
                     random_state=random_state).fit(X)

plt.figure()
for k in range(n_clusters):
    my_members = km.labels_ == k
    color = cm.nipy_spectral(float(k) / n_clusters, 1)
    plt.plot(X[my_members, 0], X[my_members, 1], 'o', marker='.', c=color)
    cluster_center = km.cluster_centers_[k]
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markerfacecolor=color, markeredgecolor='k', markersize=6)
    plt.title("Example cluster allocation with a single random init\n"
              "with MiniBatchKMeans")

plt.show()
Example #18
categories = [
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos',
    'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
    'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
    'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
    'talk.religion.misc'
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories),
                         batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
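
The excerpt stops right after fit_predict. Judging from the coclustering report a few lines up, the example presumably finishes the k-means branch with the same timing/V-measure print; a sketch:

print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_kmeans, y_true)))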
Example #19
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(),
                                 n_clusters=n_clusters,
                                 random_state=42,
                                 n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
Example #20
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 batch_size=10,
                                 random_state=42,
                                 tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
Example #21
import time

# Assumed setup, not shown in this excerpt: mini-batch size and blob centers
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]

X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)

# #############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', n_clusters=3, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0

# #############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++',
                      n_clusters=3,
                      batch_size=batch_size,
                      n_init=10,
                      max_no_improvement=10,
                      verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0

# #############################################################################
# Plot result

fig = plt.figure(figsize=(8, 3))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers
# per closest one.
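
The pairing itself falls outside this excerpt; one way to do it is with pairwise_distances_argmin from sklearn.metrics (a sketch consistent with the comment above, not necessarily the original code):

from sklearn.metrics import pairwise_distances_argmin

# Match each KMeans center to its nearest MiniBatchKMeans center so that
# both plots use a consistent color per cluster.
order = pairwise_distances_argmin(k_means.cluster_centers_,
                                  mbk.cluster_centers_)
mbk_centers_ordered = mbk.cluster_centers_[order]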
Example #22
estimators = [  # (name, estimator, center) tuples; variable name assumed
    ('Sparse comp. - MiniBatchSparsePCA',
     decomposition.MiniBatchSparsePCA(n_components=n_components,
                                      alpha=0.8,
                                      n_iter=100,
                                      batch_size=3,
                                      random_state=rng), True),
    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_components=15,
                                               alpha=0.1,
                                               n_iter=50,
                                               batch_size=3,
                                               random_state=rng), True),
    ('Cluster centers - MiniBatchKMeans',
     MiniBatchKMeans(n_clusters=n_components,
                     tol=1e-3,
                     batch_size=20,
                     max_iter=50,
                     random_state=rng), True),
    ('Factor Analysis components - FA',
     decomposition.FactorAnalysis(n_components=n_components,
                                  max_iter=20), True),
]

# #############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

# #############################################################################
# Do the estimation and plot it
Example #23
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
Example #24
import time

import matplotlib.pyplot as plt
import numpy as np

from mrex import datasets
from mrex.cluster import MiniBatchKMeans
from mrex.feature_extraction.image import extract_patches_2d

faces = datasets.fetch_olivetti_faces()

# #############################################################################
# Learn the dictionary of images

print('Learning the dictionary... ')
rng = np.random.RandomState(0)
kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
patch_size = (20, 20)

buffer = []
t0 = time.time()

# The online learning part: cycle over the whole dataset 6 times
index = 0
for _ in range(6):
    for img in faces.images:
        data = extract_patches_2d(img,
                                  patch_size,
                                  max_patches=50,
                                  random_state=rng)
        data = np.reshape(data, (len(data), -1))
        buffer.append(data)
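        # The excerpt stops while the buffer is still being filled; for the
        # online learning to happen, the loop presumably flushes the buffer
        # into partial_fit periodically. A sketch of that step (the flush
        # interval and per-batch normalization are assumptions):
        index += 1
        if index % 10 == 0:
            batch = np.concatenate(buffer, axis=0)
            batch = batch - np.mean(batch, axis=0)
            batch = batch / np.std(batch, axis=0)
            kmeans.partial_fit(batch)
            buffer = []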
Example #25
    # Inside a loop over fitted Birch models; ind, birch_model, centroids,
    # labels and info come from the enclosing loop (not shown in the excerpt)
    ax = fig.add_subplot(1, 3, ind + 1)
    for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
        mask = labels == k
        ax.scatter(X[mask, 0], X[mask, 1],
                   c='w', edgecolor=col, marker='.', alpha=0.5)
        if birch_model.n_clusters is None:
            ax.scatter(this_centroid[0], this_centroid[1], marker='+',
                       c='k', s=25)
    ax.set_ylim([-25, 25])
    ax.set_xlim([-25, 25])
    ax.set_autoscaley_on(False)
    ax.set_title('Birch %s' % info)

# Compute clustering with MiniBatchKMeans.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0)
t0 = time()
mbk.fit(X)
t_mini_batch = time() - t0
print("Time taken to run MiniBatchKMeans %0.2f seconds" % t_mini_batch)
mbk_means_labels_unique = np.unique(mbk.labels_)

ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_,
                                 range(n_clusters), colors_):
    mask = mbk.labels_ == k
    ax.scatter(X[mask, 0], X[mask, 1], marker='.',
               c='w', edgecolor=col, alpha=0.5)
    ax.scatter(this_centroid[0], this_centroid[1], marker='+',
               c='k', s=25)
Example #26
    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

# #############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k,
                         init='k-means++',
                         n_init=1,
                         init_size=1000,
                         batch_size=1000,
                         verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
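
The document-clustering example this excerpt comes from typically goes on to score the result against the ground-truth labels; a hedged sketch of that follow-up, assuming labels holds the true newsgroup targets:

from sklearn import metrics

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))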