Esempio n. 1
0
    def testElkanResultsSparse(self):
        """Lloyd ('full') and Elkan fits must agree on sparse CSR input.

        Two 100x100 CSR datasets are tried: a 10%-dense random matrix whose
        stored values are replaced by ``randn`` draws, and blobs converted
        to CSR.  Both estimators share every parameter except ``algorithm``.
        """
        shared_params = dict(n_clusters=5,
                             random_state=0,
                             n_init=1,
                             init='k-means++')

        for distribution in ('normal', 'blobs'):
            # a fresh generator per distribution keeps the data reproducible
            rng = np.random.RandomState(0)
            if distribution != 'normal':
                dense, _ = make_blobs(n_samples=100,
                                      n_features=100,
                                      random_state=rng)
                X = sp.csr_matrix(dense)
            else:
                X = sp.random(100,
                              100,
                              density=0.1,
                              format='csr',
                              random_state=rng)
                # overwrite the stored entries with gaussian values
                X.data = rng.randn(len(X.data))

            lloyd = KMeans(algorithm='full', **shared_params)
            elkan = KMeans(algorithm='elkan', **shared_params)

            lloyd.fit(X)
            elkan.fit(X)

            np.testing.assert_allclose(elkan.cluster_centers_,
                                       lloyd.cluster_centers_)
            np.testing.assert_allclose(elkan.labels_, lloyd.labels_)
Esempio n. 2
0
    def testRelocatedClusters(self):
        """Check the fit outcome when an initial center starts out empty.

        The same four 2-D points are fitted as a dense array and as a CSR
        matrix, with both the 'full' and 'elkan' algorithms; labels, inertia,
        centers and iteration count must match the expected values.
        """
        points = [[0, 0], [0.5, 0], [0.5, 1], [1, 1]]

        # the second center is too far from every point, so its cluster is
        # empty at the first iteration
        init_centers = np.array([[0.5, 0.5], [3, 3]])

        want_labels = [0, 0, 1, 1]
        want_inertia = 0.25
        want_centers = [[0.25, 0], [0.75, 1]]
        want_n_iter = 3

        # dense first, then sparse
        for build in (np.array, sp.csr_matrix):
            X = build(points)

            for algo in ('full', 'elkan'):
                model = KMeans(n_clusters=2,
                               n_init=1,
                               init=init_centers,
                               algorithm=algo)
                model.fit(X)

                np.testing.assert_array_equal(model.labels_, want_labels)
                np.testing.assert_almost_equal(model.inertia_, want_inertia)
                np.testing.assert_array_almost_equal(model.cluster_centers_,
                                                     want_centers)
                self.assertEqual(model.n_iter_, want_n_iter)
Esempio n. 3
0
def test_elkan_results(setup, distribution, tol):
    """Lloyd ('full') and Elkan must converge to the same solution.

    ``distribution`` selects the dataset ('normal' draws a 5000x10 gaussian
    sample, anything else uses ``make_blobs``); ``tol`` is forwarded to both
    estimators.  Centers, labels, iteration count and inertia are compared.
    """
    rng = np.random.RandomState(0)
    if distribution == 'normal':
        X = rng.normal(size=(5000, 10))
    else:
        X = make_blobs(random_state=rng)[0]

    # everything except the algorithm is shared between the two estimators
    common = dict(n_clusters=5,
                  random_state=0,
                  n_init=1,
                  tol=tol,
                  init='k-means++')
    lloyd = KMeans(algorithm='full', **common)
    elkan = KMeans(algorithm='elkan', **common)

    lloyd.fit(X)
    elkan.fit(X)

    np.testing.assert_allclose(elkan.cluster_centers_,
                               lloyd.cluster_centers_)
    np.testing.assert_array_equal(elkan.labels_, lloyd.labels_)

    assert elkan.n_iter_ == lloyd.n_iter_
    assert elkan.inertia_ == pytest.approx(lloyd.inertia_, rel=1e-6)
Esempio n. 4
0
    def testConsistentResultWithSklearn(self):
        """Elkan fit on a chunked tensor must match the ``SK_KMeans`` fit.

        The blob data is wrapped in a tensor with ``chunk_size=50`` for
        ``KMeans`` while ``SK_KMeans`` receives the raw array; centers, labels
        and iteration counts of both fits are compared.
        """
        rng = np.random.RandomState(0)
        raw, _ = make_blobs(random_state=rng)
        chunked = mt.tensor(raw, chunk_size=50)

        # identical hyper-parameters for both implementations
        params = dict(algorithm='elkan',
                      n_clusters=5,
                      random_state=0,
                      n_init=1,
                      tol=1e-4,
                      init='k-means++')
        ours = KMeans(**params)
        reference = SK_KMeans(**params)

        ours.fit(chunked)
        reference.fit(raw)

        np.testing.assert_allclose(ours.cluster_centers_,
                                   reference.cluster_centers_)
        np.testing.assert_array_equal(ours.labels_, reference.labels_)

        self.assertEqual(ours.n_iter_, reference.n_iter_)
Esempio n. 5
0
    def testScore(self):
        """A fit allowed 10 iterations must score higher than a 1-iteration fit.

        Checked for both the 'full' and 'elkan' algorithms on blobs generated
        around three fixed 5-D centers.
        """
        centers = np.array([
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ])
        n_clusters = centers.shape[0]
        X, _ = make_blobs(n_samples=100,
                          centers=centers,
                          cluster_std=1.,
                          random_state=42)

        def fitted_score(algo, max_iter):
            # fit a fresh single-init model and fetch its score on X
            model = KMeans(n_clusters=n_clusters,
                           max_iter=max_iter,
                           random_state=42,
                           n_init=1,
                           algorithm=algo,
                           init='k-means++')
            return model.fit(X).score(X).fetch()

        for algo in ('full', 'elkan'):
            # only max_iter differs between the two fits
            short_run = fitted_score(algo, 1)
            long_run = fitted_score(algo, 10)
            self.assertGreater(long_run, short_run)
Esempio n. 6
0
    def testKMeansInit(self):
        """Fit with every supported ``init`` on dense and CSR tensor input.

        Each input is wrapped in a tensor (``chunk_size=50``) and fitted with
        the 'elkan' algorithm for the 'random', 'k-means++' and 'k-means||'
        strategies as well as an explicit array of centers; every fitted model
        is validated by ``self._check_fitted_model``.  Finally a six-point toy
        dataset is fitted with 'k-means||' and its centers are compared with
        the expected pair.
        """
        # non-centered, sparse centers used to generate the blobs
        centers = np.array([
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ])
        n_samples = 100
        n_clusters, n_features = centers.shape
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)
        X_csr = sp.csr_matrix(X)
        for raw in [X, X_csr]:
            # Wrap each input exactly once.  The tensor does not depend on
            # ``init``, so converting inside the inner loop only re-wrapped an
            # already-wrapped tensor and clobbered the outer loop variable.
            data = mt.tensor(raw, chunk_size=50)
            for init in ['random', 'k-means++', 'k-means||', centers.copy()]:
                km = KMeans(init=init,
                            n_clusters=n_clusters,
                            random_state=42,
                            n_init=1,
                            algorithm='elkan')
                km.fit(data)
                self._check_fitted_model(km, n_clusters, n_features,
                                         true_labels)

        # two well-separated groups of three points each
        X = mt.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
        kmeans = KMeans(n_clusters=2,
                        random_state=0,
                        n_init=1,
                        init='k-means||').fit(X)
        # sort both sides so the comparison ignores cluster ordering
        self.assertEqual(sorted(kmeans.cluster_centers_.fetch().tolist()),
                         sorted([[10., 2.], [1., 2.]]))
Esempio n. 7
0
 def testKMeansFortranAlignedData(self):
     """KMeans must handle Fortran-ordered input.

     Three points are fitted starting from their two distinct locations as
     initial centers; the centers must stay put and the labels must follow.
     """
     init_centers = np.array([[0, 0], [0, 1]])
     want_labels = np.array([0, 1, 1])
     fortran_X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])

     model = KMeans(n_init=1,
                    init=init_centers,
                    random_state=42,
                    n_clusters=2)
     model.fit(fortran_X)

     np.testing.assert_array_almost_equal(model.cluster_centers_,
                                          init_centers)
     np.testing.assert_array_equal(model.labels_, want_labels)
Esempio n. 8
0
def test_k_means_fortran_aligned_data(setup):
    """Elkan KMeans must handle Fortran-ordered input.

    Three points are fitted starting from their two distinct locations as
    initial centers; the centers must stay put and the labels must follow.
    """
    init_centers = np.array([[0, 0], [0, 1]])
    want_labels = np.array([0, 1, 1])
    fortran_X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])

    params = dict(n_init=1,
                  init=init_centers,
                  random_state=42,
                  n_clusters=2,
                  algorithm='elkan')
    model = KMeans(**params)
    model.fit(fortran_X)

    np.testing.assert_array_almost_equal(model.cluster_centers_, init_centers)
    np.testing.assert_array_equal(model.labels_, want_labels)
Esempio n. 9
0
    def testKMeansExplicitInitShape(self):
        """An explicit ``init`` of the wrong shape must raise ``ValueError``.

        Covers a wrong number of features and a wrong number of clusters,
        each supplied once as an array and once through a callable.
        """
        rnd = np.random.RandomState(0)
        X = rnd.normal(size=(40, 3))

        feature_msg = "does not match the number of features of the data"
        cluster_msg = "does not match the number of clusters"

        # (init, n_clusters, expected error fragment), in the order checked
        cases = [
            # init has 2 features while the data has 3
            (X[:, :2], len(X), feature_msg),
            (lambda X_, k, random_state: X_[:, :2], len(X), feature_msg),
            # init has 2 rows while 3 clusters are requested
            (X[:2, :], 3, cluster_msg),
            (lambda X_, k, random_state: X_[:2, :], 3, cluster_msg),
        ]

        for init, n_clusters, msg in cases:
            km = KMeans(n_init=1,
                        init=init,
                        n_clusters=n_clusters,
                        algorithm='elkan')
            with pytest.raises(ValueError, match=msg):
                km.fit(X)
Esempio n. 10
0
    def testDistributedKMeans(self):
        """Run KMeans through sessions opened on the local web endpoint.

        First session: an Elkan fit on a chunked tensor is compared with the
        ``SK_KMEANS`` fit on the raw array (centers, labels, iterations).
        Second session: a 'k-means||' initialised fit only has to complete.
        """
        service_ep = 'http://127.0.0.1:' + self.web_port
        # bound the run time only when the CI environment variable is set
        if 'CI' in os.environ:
            timeout = 120
        else:
            timeout = -1

        with new_session(service_ep) as sess:
            run_kwargs = dict(timeout=timeout)

            rng = np.random.RandomState(0)
            raw, _ = make_blobs(random_state=rng)
            X = mt.tensor(raw, chunk_size=50)

            # identical hyper-parameters for both implementations
            elkan_params = dict(algorithm='elkan',
                                n_clusters=5,
                                random_state=0,
                                n_init=1,
                                tol=1e-4,
                                init='k-means++')
            km_elkan = KMeans(**elkan_params)
            sk_km_elkan = SK_KMEANS(**elkan_params)

            km_elkan.fit(X, session=sess, run_kwargs=run_kwargs)
            sk_km_elkan.fit(raw)

            np.testing.assert_allclose(km_elkan.cluster_centers_,
                                       sk_km_elkan.cluster_centers_)
            np.testing.assert_array_equal(km_elkan.labels_,
                                          sk_km_elkan.labels_)

            self.assertEqual(km_elkan.n_iter_, sk_km_elkan.n_iter_)

        with new_session(service_ep) as sess2:
            run_kwargs = dict(timeout=timeout)

            rng = np.random.RandomState(0)
            blobs, _ = make_blobs(random_state=rng)
            X = mt.tensor(blobs, chunk_size=50)

            # no assertions here: the fit merely has to run in the session
            kmeans = KMeans(n_clusters=5,
                            random_state=0,
                            n_init=1,
                            tol=1e-4,
                            init='k-means||')
            kmeans.fit(X, session=sess2, run_kwargs=run_kwargs)
Esempio n. 11
0
    def testKMeansFitPredict(self):
        """``fit(X).predict(X)`` and ``fit_predict(X)`` must give one clustering.

        Run for both algorithms over four (seed, max_iter, tol) settings that
        span strict/loose tolerance with and without convergence.
        """
        settings = [
            (0, 2, 1e-7),  # strict non-convergence
            (1, 2, 1e-1),  # loose non-convergence
            (3, 300, 1e-7),  # strict convergence
            (4, 300, 1e-1),  # loose convergence
        ]
        # algorithm-major ordering: every setting for 'full', then 'elkan'
        cases = [(algo, seed, max_iter, tol)
                 for algo in ('full', 'elkan')
                 for seed, max_iter, tol in settings]

        for algo, seed, max_iter, tol in cases:
            rng = np.random.RandomState(seed)
            X, _ = make_blobs(n_samples=1000,
                              n_features=10,
                              centers=10,
                              random_state=rng)

            kmeans = KMeans(algorithm=algo,
                            n_clusters=10,
                            random_state=seed,
                            tol=tol,
                            max_iter=max_iter,
                            init='k-means++')

            via_predict = kmeans.fit(X).predict(X)
            via_fit_predict = kmeans.fit_predict(X)

            # Chunks may be processed in a different order when several
            # threads are used, so the label values themselves can differ
            # between the two strategies; a v-measure of 1 shows that they
            # still describe the same clustering.
            self.assertAlmostEqual(
                v_measure_score(via_predict, via_fit_predict), 1)
Esempio n. 12
0
def test_k_means_results(setup, representation, dtype, algo):
    """Weighted fit on four points must give the known exact solution.

    ``representation`` picks a dense array or a CSR matrix, ``dtype`` the
    element type of data and centers, ``algo`` the KMeans algorithm.
    """
    builders = dict(dense=np.array, sparse=sp.csr_matrix)
    build = builders[representation]

    X = build([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    weights = [3, 1, 1, 3]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    want_labels = [0, 0, 1, 1]
    want_inertia = 0.1875
    want_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    want_n_iter = 2

    model = KMeans(n_clusters=2,
                   n_init=1,
                   init=init_centers,
                   algorithm=algo)
    model.fit(X, sample_weight=weights)

    np.testing.assert_array_equal(model.labels_, want_labels)
    np.testing.assert_almost_equal(model.inertia_, want_inertia)
    np.testing.assert_array_almost_equal(model.cluster_centers_,
                                         want_centers)
    assert model.n_iter_ == want_n_iter
Esempio n. 13
0
def test_k_means_new_centers(setup):
    """Exercise the code path where a center gets reassigned.

    The same Elkan estimator, started from deliberately poor centers, is
    fitted on a dense array and on its COO counterpart; after canonical
    relabelling the labels must equal the expected assignment.
    """
    X = np.array([[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 0],
                  [0, 0, 0, 0], [0, 1, 0, 0]])
    want_labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0], [.2, 0, .2, .2], [+0, 0, 0, 0]])

    params = dict(n_clusters=3,
                  init=bad_centers,
                  n_init=1,
                  max_iter=10,
                  random_state=1,
                  algorithm='elkan')
    km = KMeans(**params)

    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        raw_labels = km.labels_.fetch()
        # Relabel so that the first instance is in cluster 0, the second in
        # cluster 1, ...: each label is replaced by the index of the first
        # sample carrying it.
        _, first_seen = np.unique(raw_labels, return_index=True)
        canonical = first_seen[raw_labels]
        np.testing.assert_array_equal(canonical, want_labels)
Esempio n. 14
0
    def testTransform(self):
        """Transforming the fitted centers must give a zero diagonal.

        ``transform`` applied to ``cluster_centers_`` has to yield exactly 0
        at position (c, c) and a strictly positive value everywhere else.
        """
        centers = np.array([
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ])
        n_clusters = centers.shape[0]
        X, _ = make_blobs(n_samples=100,
                          centers=centers,
                          cluster_std=1.,
                          random_state=42)

        km = KMeans(n_clusters=n_clusters, init='k-means++', algorithm='elkan')
        km.fit(X)
        X_new = km.transform(km.cluster_centers_).fetch()

        for row in range(n_clusters):
            for col in range(n_clusters):
                if row == col:
                    # diagonal entry: value for a center against itself
                    assert X_new[row, col] == 0
                else:
                    assert X_new[row, col] > 0
Esempio n. 15
0
 def testKMeansInit(self):
     """Fit with every supported ``init`` on dense and CSR input.

     The 'random', 'k-means++' and 'k-means||' strategies and an explicit
     array of centers are each fitted on both representations; every fitted
     model is validated by ``self._check_fitted_model``.
     """
     # non-centered, sparse centers used to generate the blobs
     centers = np.array([
         [0.0, 5.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 4.0, 0.0, 0.0],
         [1.0, 0.0, 0.0, 5.0, 1.0],
     ])
     n_clusters, n_features = centers.shape
     X, true_labels = make_blobs(n_samples=100,
                                 centers=centers,
                                 cluster_std=1.,
                                 random_state=42)

     for data in (X, sp.csr_matrix(X)):
         inits = ['random', 'k-means++', 'k-means||', centers.copy()]
         for init in inits:
             params = dict(init=init,
                           n_clusters=n_clusters,
                           random_state=42,
                           n_init=1)
             km = KMeans(**params)
             km.fit(data)
             self._check_fitted_model(km, n_clusters, n_features,
                                      true_labels)
Esempio n. 16
0
    def testKMeansResults(self):
        """Weighted fit on four points must give the known exact solution.

        Checked for dense and CSR input, float32 and float64, and for both the
        'full' and 'elkan' algorithms.
        """
        points = [[0, 0], [0.5, 0], [0.5, 1], [1, 1]]
        # will be rescaled to [1.5, 0.5, 0.5, 1.5]
        sample_weight = [3, 1, 1, 3]

        # expectations that do not depend on representation or dtype
        want_labels = [0, 0, 1, 1]
        want_inertia = 0.1875
        want_n_iter = 2

        # dense first, then sparse
        for build in (np.array, sp.csr_matrix):
            for dtype in (np.float32, np.float64):
                X = build(points, dtype=dtype)
                init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)
                want_centers = np.array([[0.125, 0], [0.875, 1]],
                                        dtype=dtype)

                for algo in ('full', 'elkan'):
                    model = KMeans(n_clusters=2,
                                   n_init=1,
                                   init=init_centers,
                                   algorithm=algo)
                    model.fit(X, sample_weight=sample_weight)

                    np.testing.assert_array_equal(model.labels_, want_labels)
                    np.testing.assert_almost_equal(model.inertia_,
                                                   want_inertia)
                    np.testing.assert_array_almost_equal(
                        model.cluster_centers_, want_centers)
                    self.assertEqual(model.n_iter_, want_n_iter)
Esempio n. 17
0
    def testElkanResults(self):
        """Lloyd ('full') and Elkan must converge to the same solution.

        Checked on a 5000x10 gaussian sample and on blobs, each with three
        tolerances; centers, labels, iteration count and inertia are compared.
        """
        for distribution in ('normal', 'blobs'):
            # the data is generated once per distribution and reused per tol
            rng = np.random.RandomState(0)
            if distribution == 'normal':
                X = rng.normal(size=(5000, 10))
            else:
                X = make_blobs(random_state=rng)[0]

            for tol in (1e-2, 1e-4, 1e-8):
                # everything except the algorithm is shared
                common = dict(n_clusters=5,
                              random_state=0,
                              n_init=1,
                              tol=tol,
                              init='k-means++')
                lloyd = KMeans(algorithm='full', **common)
                elkan = KMeans(algorithm='elkan', **common)

                lloyd.fit(X)
                elkan.fit(X)

                np.testing.assert_allclose(elkan.cluster_centers_,
                                           lloyd.cluster_centers_)
                np.testing.assert_array_equal(elkan.labels_, lloyd.labels_)

                self.assertEqual(elkan.n_iter_, lloyd.n_iter_)
                self.assertEqual(elkan.inertia_,
                                 pytest.approx(lloyd.inertia_, rel=1e-6))
Esempio n. 18
0
def test_k_means_fit_predict(setup, algo, seed, max_iter, tol):
    """``fit(X).predict(X)`` and ``fit_predict(X)`` must give one clustering.

    ``seed`` drives both the blob data and the estimator; ``algo``,
    ``max_iter`` and ``tol`` are forwarded to ``KMeans``.
    """
    rng = np.random.RandomState(seed)
    X, _ = make_blobs(n_samples=1000,
                      n_features=10,
                      centers=10,
                      random_state=rng)

    params = dict(algorithm=algo,
                  n_clusters=10,
                  random_state=seed,
                  tol=tol,
                  max_iter=max_iter,
                  init='k-means++')
    kmeans = KMeans(**params)

    via_predict = kmeans.fit(X).predict(X)
    via_fit_predict = kmeans.fit_predict(X)

    # Chunks may be processed in a different order when several threads are
    # used, so the label values themselves can differ between the two
    # strategies; a v-measure of 1 shows that they still describe the same
    # clustering.
    agreement = v_measure_score(via_predict, via_fit_predict)
    assert pytest.approx(agreement) == 1