def test_kmeans_constraint_blobs20(self):
    data = make_blobs(n_samples=20, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(-10.0, 0.0),
                      shuffle=True, random_state=0)
    X1 = data[0]
    data = make_blobs(n_samples=10, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(0.0, 10.0),
                      shuffle=True, random_state=0)
    X2 = data[0]
    X = numpy.vstack([X1, X2])
    km = ConstraintKMeans(n_clusters=4, verbose=0, kmeans0=False,
                          random_state=2, strategy='gain',
                          balanced_predictions=True, history=True)
    km.fit(X)
    pred = km.predict(X)
    diff = numpy.abs(km.labels_ - pred).sum()
    self.assertLesser(diff, 6)
    cls = km.cluster_centers_iter_
    self.assertEqual(len(cls.shape), 3)
def test_kmeans_constraint_blobs(self):
    data = make_blobs(n_samples=8, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(-10.0, 0.0),
                      shuffle=True, random_state=0)
    X1 = data[0]
    data = make_blobs(n_samples=4, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(0.0, 10.0),
                      shuffle=True, random_state=0)
    X2 = data[0]
    X = numpy.vstack([X1, X2])
    km = ConstraintKMeans(n_clusters=4, verbose=0, kmeans0=False,
                          random_state=2, strategy='gain',
                          balanced_predictions=True)
    km.fit(X)
    self.assertEqual(km.labels_[-2], km.labels_[-1])
    self.assertIn(km.labels_[-1], {km.labels_[-4], km.labels_[-3]})
def test_kmeans_constraint_gain(self):
    mat = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
    km = ConstraintKMeans(n_clusters=2, verbose=0, kmeans0=False,
                          random_state=1, strategy='gain')
    km.fit(mat)
    self.assertEqual(km.cluster_centers_.shape, (2, 2))
    self.assertEqualFloat(km.inertia_, 0.455)
    self.assertEqual(km.cluster_centers_,
                     numpy.array([[0.6, 0.6], [-0.05, -0.05]]))
    self.assertEqual(km.labels_, numpy.array([1, 0, 1, 0]))
    pred = km.predict(mat)
    self.assertEqual(pred, numpy.array([1, 1, 1, 0]))
def test_kmeans_constraint_pickle(self):
    df = pandas.DataFrame(dict(
        y=[0, 1, 0, 1, 0, 1, 0, 1],
        X1=[0.5, 0.6, 0.52, 0.62, 0.5, 0.6, 0.51, 0.61],
        X2=[0.5, 0.6, 0.7, 0.5, 1.5, 1.6, 1.7, 1.8]))
    X = df.drop('y', axis=1)
    y = df['y']
    model = ConstraintKMeans(n_clusters=2, strategy='distance')
    model.fit(X, y)
    pred = model.transform(X)
    st = BytesIO()
    pickle.dump(model, st)
    st = BytesIO(st.getvalue())
    rec = pickle.load(st)
    pred2 = rec.transform(X)
    self.assertEqualArray(pred, pred2)
def test_kmeans_constraint_gain3(self):
    mat = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1],
                       [1, 1], [1.1, 0.9], [-1.1, 1.]])
    # Choose random_state=2 to get the labels [1 1 0 2 2 0].
    # This configuration can only be modified with a permutation
    # of 3 elements.
    km = ConstraintKMeans(n_clusters=3, verbose=0, kmeans0=False,
                          random_state=1, strategy='gain',
                          balanced_predictions=True)
    km.fit(mat)
    self.assertEqual(km.cluster_centers_.shape, (3, 2))
    lab = km.labels_
    self.assertEqual(lab[1], lab[2])
    self.assertEqual(lab[0], lab[5])
    self.assertEqual(lab[3], lab[4])
    pred = km.predict(mat)
    self.assertEqualArray(pred, lab)
def test_kmeans_constraint_sparse(self):
    mat = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
    mat = scipy.sparse.csr_matrix(mat)
    km = ConstraintKMeans(n_clusters=2, verbose=0, strategy='distance')
    km.fit(mat)
    self.assertEqual(km.cluster_centers_.shape, (2, 2))
    self.assertEqualFloat(km.inertia_, 0.455)
    if km.labels_[0] == 0:
        self.assertEqual(km.labels_, numpy.array([0, 1, 0, 1]))
        self.assertEqual(km.cluster_centers_,
                         numpy.array([[-0.05, -0.05], [0.6, 0.6]]))
    else:
        self.assertEqual(km.labels_, numpy.array([1, 0, 1, 0]))
        self.assertEqual(km.cluster_centers_,
                         numpy.array([[0.6, 0.6], [-0.05, -0.05]]))
    pred = km.predict(mat)
    if km.labels_[0] == 0:
        self.assertEqual(pred, numpy.array([0, 0, 0, 1]))
    else:
        self.assertEqual(pred, numpy.array([1, 1, 1, 0]))
def test_kmeans_constraint_weights_bigger(self):
    n_samples = 100
    data = make_blobs(n_samples=n_samples, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(-10.0, 0.0),
                      shuffle=True, random_state=2)
    X1 = data[0]
    data = make_blobs(n_samples=n_samples // 2, n_features=2, centers=2,
                      cluster_std=1.0, center_box=(0.0, 10.0),
                      shuffle=True, random_state=2)
    X2 = data[0]
    X = numpy.vstack([X1, X2])
    km = ConstraintKMeans(n_clusters=4, strategy='weights', history=True)
    km.fit(X)
    cl = km.predict(X)
    self.assertEqual(cl.shape, (X.shape[0], ))
    cls = km.cluster_centers_iter_
    self.assertEqual(len(cls.shape), 3)
    edges = km.cluster_edges()
    self.assertIsInstance(edges, set)
    self.assertEqual(len(edges), 5)
    self.assertIsInstance(list(edges)[0], tuple)
def test_kmeans_constraint_pipeline(self):
    data = load_iris()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    km = ConstraintKMeans(strategy='distance')
    pipe = make_pipeline(km, LogisticRegression())
    try:
        pipe.fit(X_train, y_train)
    except AttributeError as e:
        if compare_module_version(sklver, "0.24") < 0:
            # Older scikit-learn versions are not supported here.
            return
        raise e
    pred = pipe.predict(X_test)
    score = accuracy_score(y_test, pred)
    self.assertGreater(score, 0.8)
    score2 = pipe.score(X_test, y_test)
    self.assertEqual(score, score2)
    rp = repr(km)
    self.assertStartsWith("ConstraintKMeans(", rp)
def test_kmeans_constraint_grid(self):
    df = pandas.DataFrame(dict(
        y=[0, 1, 0, 1, 0, 1, 0, 1],
        X1=[0.5, 0.6, 0.52, 0.62, 0.5, 0.6, 0.51, 0.61],
        X2=[0.5, 0.6, 0.7, 0.5, 1.5, 1.6, 1.7, 1.8]))
    X = df.drop('y', axis=1)
    y = df['y']
    model = make_pipeline(
        ConstraintKMeans(random_state=0, strategy='distance'),
        DecisionTreeClassifier())
    res = model.get_params(True)
    self.assertNotEmpty(res)
    parameters = {
        'constraintkmeans__n_clusters': [2, 3, 4],
        'constraintkmeans__balanced_predictions': [False, True],
    }
    clf = GridSearchCV(model, parameters, cv=3)
    clf.fit(X, y)
    pred = clf.predict(X)
    self.assertEqual(pred.shape, (8,))
def test_kmeans_constraint_weights(self):
    mat = numpy.array([[0, 0], [0.2, 0.2], [-0.1, -0.1], [1, 1]])
    km = ConstraintKMeans(n_clusters=2, verbose=10, kmeans0=False,
                          random_state=1, strategy='weights')
    buf = BufferedPrint()
    km.fit(mat, fLOG=buf.fprint)
    km = ConstraintKMeans(n_clusters=2, verbose=5, kmeans0=False,
                          random_state=1, strategy='weights')
    km.fit(mat, fLOG=buf.fprint)
    self.assertEqual(km.cluster_centers_.shape, (2, 2))
    self.assertLesser(km.inertia_, 4.55)
    self.assertEqual(km.cluster_centers_,
                     numpy.array([[0.6, 0.6], [-0.05, -0.05]]))
    self.assertEqual(km.labels_, numpy.array([1, 0, 1, 0]))
    pred = km.predict(mat)
    self.assertEqual(pred, numpy.array([1, 1, 1, 0]))
    dist = km.transform(mat)
    self.assertEqual(dist.shape, (4, 2))
    score = km.score(mat)
    self.assertEqual(score.shape, (4, ))
    self.assertIn("CKMeans", str(buf))
def test_kmeans_constraint_exc(self):
    self.assertRaise(
        lambda: ConstraintKMeans(n_clusters=2, strategy='r'),
        ValueError)
colors = 'brgy'
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
for i in range(0, max(cl) + 1):
    ax.plot(X[cl == i, 0], X[cl == i, 1],
            colors[i] + '.', label='cl%d' % i)
    x = [km.cluster_centers_[i, 0], km.cluster_centers_[i, 0]]
    y = [km.cluster_centers_[i, 1], km.cluster_centers_[i, 1]]
    ax.plot(x, y, colors[i] + '+')
ax.set_title('KMeans 4 clusters\n%r' % hist)
ax.legend()

#####################################
# Constraint KMeans
# =================

km1 = ConstraintKMeans(n_clusters=4, strategy='gain',
                       balanced_predictions=True)
km1.fit(X)

km2 = ConstraintKMeans(n_clusters=4, strategy='distance',
                       balanced_predictions=True)
km2.fit(X)

##########################
# This algorithm tries to exchange points
# between clusters.

cl1 = km1.predict(X)
hist1 = Counter(cl1)
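
##########################
# A minimal sketch, not part of the original example: compare how many
# points each strategy assigns per cluster. It assumes ``Counter`` is
# already imported (``hist1`` above uses it); ``cl2`` and ``hist2`` are
# names introduced here for illustration only.

cl2 = km2.predict(X)
hist2 = Counter(cl2)
print("gain strategy    :", hist1)
print("distance strategy:", hist2)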