def test_kmeans_constraint_blobs20(self):
     # Fits a 4-cluster ConstraintKMeans ('gain' strategy, balanced
     # predictions, history enabled) on two stacked blob samples, then
     # checks that predict() stays close to the fitted labels and that
     # the recorded per-iteration centers have three dimensions.
     blob_kwargs = dict(n_features=2,
                        centers=2,
                        cluster_std=1.0,
                        shuffle=True,
                        random_state=0)
     # The two samples only differ by their size and their center_box.
     lower = make_blobs(n_samples=20,
                        center_box=(-10.0, 0.0),
                        **blob_kwargs)[0]
     upper = make_blobs(n_samples=10,
                        center_box=(0.0, 10.0),
                        **blob_kwargs)[0]
     X = numpy.vstack([lower, upper])

     km = ConstraintKMeans(n_clusters=4,
                           verbose=0,
                           kmeans0=False,
                           random_state=2,
                           strategy='gain',
                           balanced_predictions=True,
                           history=True)
     km.fit(X)
     predicted = km.predict(X)

     # Sum of absolute differences between fitted and predicted labels.
     mismatch = numpy.abs(km.labels_ - predicted).sum()
     self.assertLesser(mismatch, 6)

     centers_history = km.cluster_centers_iter_
     self.assertEqual(len(centers_history.shape), 3)
 def test_kmeans_constraint_blobs(self):
     # Fits a 4-cluster ConstraintKMeans ('gain' strategy, balanced
     # predictions) on 8 + 4 stacked blob points and checks how the
     # last four points are grouped.
     blob_kwargs = dict(n_features=2,
                        centers=2,
                        cluster_std=1.0,
                        shuffle=True,
                        random_state=0)
     lower = make_blobs(n_samples=8,
                        center_box=(-10.0, 0.0),
                        **blob_kwargs)[0]
     upper = make_blobs(n_samples=4,
                        center_box=(0.0, 10.0),
                        **blob_kwargs)[0]
     X = numpy.vstack([lower, upper])

     km = ConstraintKMeans(n_clusters=4,
                           verbose=0,
                           kmeans0=False,
                           random_state=2,
                           strategy='gain',
                           balanced_predictions=True)
     km.fit(X)

     # The last two points share a label, and that label is also the
     # label of one of the two points just before them.
     self.assertEqual(km.labels_[-2], km.labels_[-1])
     neighbours = {km.labels_[-4], km.labels_[-3]}
     self.assertIn(km.labels_[-1], neighbours)
 def test_kmeans_constraint_gain(self):
     # Two-cluster fit with the 'gain' strategy on four 2-D points;
     # checks centers, inertia, fitted labels and predictions.
     points = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
     expected_centers = numpy.array([[0.6, 0.6], [-0.05, -0.05]])
     expected_labels = numpy.array([1, 0, 1, 0])
     expected_pred = numpy.array([1, 1, 1, 0])

     km = ConstraintKMeans(n_clusters=2,
                           verbose=0,
                           kmeans0=False,
                           random_state=1,
                           strategy='gain')
     km.fit(points)

     self.assertEqual(km.cluster_centers_.shape, (2, 2))
     self.assertEqualFloat(km.inertia_, 0.455)
     self.assertEqual(km.cluster_centers_, expected_centers)
     self.assertEqual(km.labels_, expected_labels)
     # predict() is not expected to reproduce the fitted labels here.
     self.assertEqual(km.predict(points), expected_pred)
 def test_kmeans_constraint_pickle(self):
     # A fitted model must give the same transform() output after a
     # pickle round trip through an in-memory buffer.
     columns = {
         'y': [0, 1, 0, 1, 0, 1, 0, 1],
         'X1': [0.5, 0.6, 0.52, 0.62, 0.5, 0.6, 0.51, 0.61],
         'X2': [0.5, 0.6, 0.7, 0.5, 1.5, 1.6, 1.7, 1.8],
     }
     df = pandas.DataFrame(columns)
     X = df.drop('y', axis=1)
     y = df['y']

     model = ConstraintKMeans(n_clusters=2, strategy='distance')
     model.fit(X, y)
     before = model.transform(X)

     # Serialize, rewind the buffer, then load the copy back.
     buffer = BytesIO()
     pickle.dump(model, buffer)
     buffer.seek(0)
     restored = pickle.load(buffer)

     after = restored.transform(X)
     self.assertEqualArray(before, after)
 def test_kmeans_constraint_gain3(self):
     # Three-cluster fit with the 'gain' strategy and balanced
     # predictions on six 2-D points.
     # NOTE(review): the previous comment said random_state=2 was chosen
     # to obtain the labels [1 1 0 2 2 0], a configuration which can only
     # be modified with a permutation of 3 elements. The code below
     # passes random_state=1 and asserts different pairs - confirm which
     # one is intended.
     points = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1],
                           [1, 1], [1.1, 0.9], [-1.1, 1.]])
     km = ConstraintKMeans(n_clusters=3,
                           verbose=0,
                           kmeans0=False,
                           random_state=1,
                           strategy='gain',
                           balanced_predictions=True)
     km.fit(points)
     self.assertEqual(km.cluster_centers_.shape, (3, 2))

     # Points are expected to be paired as (1, 2), (0, 5) and (3, 4).
     labels = km.labels_
     for left, right in ((1, 2), (0, 5), (3, 4)):
         self.assertEqual(labels[left], labels[right])

     # Balanced predictions must reproduce the fitted labels.
     self.assertEqualArray(km.predict(points), labels)
 def test_kmeans_constraint_sparse(self):
     # Same four points as the dense tests, given as a CSR matrix.
     # Cluster numbering is not fixed, so the expected values depend on
     # the label given to the first point.
     dense = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
     mat = scipy.sparse.csr_matrix(dense)

     # (labels, centers, predictions) for each possible numbering.
     zero_first = (numpy.array([0, 1, 0, 1]),
                   numpy.array([[-0.05, -0.05], [0.6, 0.6]]),
                   numpy.array([0, 0, 0, 1]))
     one_first = (numpy.array([1, 0, 1, 0]),
                  numpy.array([[0.6, 0.6], [-0.05, -0.05]]),
                  numpy.array([1, 1, 1, 0]))

     km = ConstraintKMeans(n_clusters=2, verbose=0, strategy='distance')
     km.fit(mat)
     self.assertEqual(km.cluster_centers_.shape, (2, 2))
     self.assertEqualFloat(km.inertia_, 0.455)

     chosen = zero_first if km.labels_[0] == 0 else one_first
     self.assertEqual(km.labels_, chosen[0])
     self.assertEqual(km.cluster_centers_, chosen[1])

     pred = km.predict(mat)
     # The numbering is looked up again after predict().
     chosen = zero_first if km.labels_[0] == 0 else one_first
     self.assertEqual(pred, chosen[2])
 def test_kmeans_constraint_weights_bigger(self):
     # 'weights' strategy with history on a larger blob sample; checks
     # the prediction shape, the recorded centers and cluster_edges().
     n_samples = 100
     blob_kwargs = dict(n_features=2,
                        centers=2,
                        cluster_std=1.0,
                        shuffle=True,
                        random_state=2)
     lower = make_blobs(n_samples=n_samples,
                        center_box=(-10.0, 0.0),
                        **blob_kwargs)[0]
     upper = make_blobs(n_samples=n_samples // 2,
                        center_box=(0.0, 10.0),
                        **blob_kwargs)[0]
     X = numpy.vstack([lower, upper])

     km = ConstraintKMeans(n_clusters=4, strategy='weights', history=True)
     km.fit(X)

     # One predicted label per row.
     predicted = km.predict(X)
     self.assertEqual(predicted.shape, (X.shape[0], ))

     centers_history = km.cluster_centers_iter_
     self.assertEqual(len(centers_history.shape), 3)

     # cluster_edges() must return a set of 5 tuples.
     edges = km.cluster_edges()
     self.assertIsInstance(edges, set)
     self.assertEqual(len(edges), 5)
     self.assertIsInstance(next(iter(edges)), tuple)
 def test_kmeans_constraint_pipeline(self):
     # ConstraintKMeans used as the first step of a scikit-learn
     # pipeline feeding a LogisticRegression on the iris dataset.
     #
     # This method used to be defined twice with the same name; the
     # second definition silently replaced the first one, which never
     # ran. Both are merged here, keeping the version guard of the
     # definition that was actually executed.
     data = load_iris()
     X, y = data.data, data.target
     # NOTE(review): the split is not seeded, so the accuracy checked
     # below varies from run to run.
     X_train, X_test, y_train, y_test = train_test_split(X, y)
     km = ConstraintKMeans(strategy='distance')
     pipe = make_pipeline(km, LogisticRegression())
     try:
         pipe.fit(X_train, y_train)
     except AttributeError:
         # An AttributeError is tolerated when scikit-learn is older
         # than 0.24: the test then stops without checking anything.
         if compare_module_version(sklver, "0.24") < 0:
             return
         # Bare raise keeps the original traceback untouched.
         raise
     pred = pipe.predict(X_test)
     score = accuracy_score(y_test, pred)
     self.assertGreater(score, 0.8)
     # Pipeline.score must agree with the accuracy computed by hand.
     score2 = pipe.score(X_test, y_test)
     self.assertEqual(score, score2)
     rp = repr(km)
     self.assertStartsWith("ConstraintKMeans(", rp)
    def test_kmeans_constraint_grid(self):
        # Grid search over the ConstraintKMeans step of a pipeline that
        # ends with a DecisionTreeClassifier.
        columns = {
            'y': [0, 1, 0, 1, 0, 1, 0, 1],
            'X1': [0.5, 0.6, 0.52, 0.62, 0.5, 0.6, 0.51, 0.61],
            'X2': [0.5, 0.6, 0.7, 0.5, 1.5, 1.6, 1.7, 1.8],
        }
        df = pandas.DataFrame(columns)
        X = df.drop('y', axis=1)
        y = df['y']

        clusterer = ConstraintKMeans(random_state=0, strategy='distance')
        model = make_pipeline(clusterer, DecisionTreeClassifier())
        # The pipeline must expose its parameters.
        self.assertNotEmpty(model.get_params(True))

        # Keys use the step name 'constraintkmeans' as prefix.
        search_space = {
            'constraintkmeans__n_clusters': [2, 3, 4],
            'constraintkmeans__balanced_predictions': [False, True],
        }
        clf = GridSearchCV(model, search_space, cv=3)
        clf.fit(X, y)
        # One prediction per row of the 8-row frame.
        self.assertEqual(clf.predict(X).shape, (8,))
    def test_kmeans_constraint_weights(self):
        # 'weights' strategy on four 2-D points, fitted twice with
        # different verbosity levels while logging into one buffer.
        points = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
        shared = dict(n_clusters=2,
                      kmeans0=False,
                      random_state=1,
                      strategy='weights')
        log = BufferedPrint()

        # First run with verbose=10; only its log output is used.
        chatty = ConstraintKMeans(verbose=10, **shared)
        chatty.fit(points, fLOG=log.fprint)

        # Second run with verbose=5; this model is the one checked.
        km = ConstraintKMeans(verbose=5, **shared)
        km.fit(points, fLOG=log.fprint)

        expected_centers = numpy.array([[0.6, 0.6], [-0.05, -0.05]])
        expected_labels = numpy.array([1, 0, 1, 0])
        expected_pred = numpy.array([1, 1, 1, 0])

        self.assertEqual(km.cluster_centers_.shape, (2, 2))
        self.assertLesser(km.inertia_, 4.55)
        self.assertEqual(km.cluster_centers_, expected_centers)
        self.assertEqual(km.labels_, expected_labels)
        self.assertEqual(km.predict(points), expected_pred)

        # transform() gives one column per cluster, score() one value
        # per row.
        self.assertEqual(km.transform(points).shape, (4, 2))
        self.assertEqual(km.score(points).shape, (4, ))

        # The verbose runs must have written something tagged "CKMeans".
        self.assertIn("CKMeans", str(log))
 def test_kmeans_constraint_exc(self):
     # Building the model with strategy='r' must raise ValueError.
     def build_with_bad_strategy():
         return ConstraintKMeans(n_clusters=2, strategy='r')

     self.assertRaise(build_with_bad_strategy, ValueError)
# Example no. 13
# 0
# One color letter per cluster index.
colors = 'brgy'
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
for i in range(max(cl) + 1):
    # Points assigned to cluster i, drawn as dots.
    members = X[cl == i]
    ax.plot(members[:, 0], members[:, 1], colors[i] + '.',
            label='cl%d' % i)
    # Center of cluster i, drawn as a '+' (same coordinates twice).
    x = [km.cluster_centers_[i, 0]] * 2
    y = [km.cluster_centers_[i, 1]] * 2
    ax.plot(x, y, colors[i] + '+')
ax.set_title('KMeans 4 clusters\n%r' % hist)
ax.legend()

#####################################
# Constraint KMeans
# =================

# Two balanced models which only differ by their strategy.
km1 = ConstraintKMeans(
    n_clusters=4, strategy='gain', balanced_predictions=True)
km1.fit(X)

km2 = ConstraintKMeans(
    n_clusters=4, strategy='distance', balanced_predictions=True)
km2.fit(X)

##########################
# This algorithm tries to exchange points
# between clusters.

# Predicted labels of the 'gain' model and how many points each gets.
cl1 = km1.predict(X)
hist1 = Counter(cl1)