def setUp(self):
        """Prepare the two-cluster fixture used by the cosine-distance tests.

        Stores on ``self`` the sample points, the expected centers,
        assignments and score, and a ``KMeansClustering`` estimator
        configured with ``RANDOM_INIT`` and ``COSINE_DISTANCE``.
        """
        # The first four rows have a dominant x component, the last four a
        # dominant y component; they form the two expected clusters.
        x_dominant_rows = [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2]]
        y_dominant_rows = [[0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]]
        self.points = np.array(x_dominant_rows + y_dominant_rows,
                               dtype=np.float32)
        self.num_points = self.points.shape[0]

        def _normalized_mean(rows):
            # keepdims=True keeps the averaged row 2-D for normalize(); the
            # trailing [0] then extracts that single row again.
            return normalize(np.mean(rows, axis=0, keepdims=True))[0]

        # Expected center of each cluster: normalized mean of the normalized
        # member points.
        self.true_centers = np.array(
            [
                _normalized_mean(normalize(self.points)[0:4, :]),
                _normalized_mean(normalize(self.points)[4:, :]),
            ],
            dtype=np.float32)
        self.true_assignments = np.array([0, 0, 0, 0, 1, 1, 1, 1])

        # Expected score: point count minus the tensordot of the normalized
        # points with the center each point is assigned to.
        centers_per_point = self.true_centers[self.true_assignments]
        self.true_score = len(self.points) - np.tensordot(
            normalize(self.points), centers_per_point)

        self.num_centers = 2
        estimator_cls = kmeans_lib.KMeansClustering
        self.kmeans = estimator_cls(
            self.num_centers,
            initial_clusters=estimator_cls.RANDOM_INIT,
            distance_metric=estimator_cls.COSINE_DISTANCE,
            use_mini_batch=self.use_mini_batch,
            mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
            config=self.config(3))
 def _kmeans(self, relative_tolerance=None):
     """Return a new squared-Euclidean KMeansClustering for this fixture.

     The number of centers, initial clusters and mini-batch settings are
     read from ``self``; the random seed is fixed at 24.

     Args:
       relative_tolerance: Forwarded unchanged to the estimator's
         ``relative_tolerance`` argument. Defaults to None.

     Returns:
       A newly constructed ``kmeans_lib.KMeansClustering`` instance.
     """
     estimator_cls = kmeans_lib.KMeansClustering
     estimator_kwargs = {
         'initial_clusters': self.initial_clusters,
         'distance_metric': estimator_cls.SQUARED_EUCLIDEAN_DISTANCE,
         'use_mini_batch': self.use_mini_batch,
         'mini_batch_steps_per_iteration':
             self.mini_batch_steps_per_iteration,
         'random_seed': 24,
         'relative_tolerance': relative_tolerance,
     }
     return estimator_cls(self.num_centers, **estimator_kwargs)
    def test_predict_kmeans_plus_plus(self):
        # Trains a 3-cluster cosine-distance model on a skewed data set and
        # checks the learned centers, the predicted assignments and the score
        # against values computed directly with NumPy.
        #
        # Most points are concentrated near one center. KMeans++ is likely to
        # find the less populated centers.
        points = np.array(
            [
                # Cluster 0 (2 points).
                [2.5, 3.5], [2.5, 3.5],
                # Cluster 1 (2 points).
                [-2, 3], [-2, 3],
                # Cluster 2 (8 points).
                [-3, -3], [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1],
                [-3., -3.1], [-3., -3.1], [-3.2, -3.], [-3., -3.],
            ],
            dtype=np.float32)

        def _normalized_mean(rows):
            # keepdims=True keeps the averaged row 2-D for normalize(); the
            # trailing [0] then extracts that single row again.
            return normalize(np.mean(rows, axis=0, keepdims=True))[0]

        true_centers = np.array(
            [
                _normalized_mean(normalize(points)[0:2, :]),
                _normalized_mean(normalize(points)[2:4, :]),
                _normalized_mean(normalize(points)[4:, :]),
            ],
            dtype=np.float32)
        true_assignments = [0, 0, 1, 1] + [2] * 8
        true_score = len(points) - np.tensordot(
            normalize(points), true_centers[true_assignments])

        estimator_cls = kmeans_lib.KMeansClustering
        kmeans = estimator_cls(
            3,
            initial_clusters=self.initial_clusters,
            distance_metric=estimator_cls.COSINE_DISTANCE,
            use_mini_batch=self.use_mini_batch,
            mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
            config=self.config(3))

        def _all_points_input_fn():
            # Feeds the whole data set as a single constant tensor.
            return constant_op.constant(points), None

        def _single_epoch_input_fn():
            # Same data, limited to one epoch so the prediction iterator
            # terminates.
            limited = input_lib.limit_epochs(constant_op.constant(points),
                                             num_epochs=1)
            return limited, None

        kmeans.train(input_fn=_all_points_input_fn, steps=30)

        # Cluster order is arbitrary, so compare the sorted center lists.
        centers = normalize(kmeans.cluster_centers())
        self.assertAllClose(sorted(centers.tolist()),
                            sorted(true_centers.tolist()),
                            atol=1e-2)

        # Compare the center each point maps to rather than raw cluster ids.
        assignments = list(
            kmeans.predict_cluster_index(input_fn=_single_epoch_input_fn))
        self.assertAllClose(centers[assignments],
                            true_centers[true_assignments],
                            atol=1e-2)

        score = kmeans.score(input_fn=_all_points_input_fn)
        self.assertAllClose(score, true_score, atol=1e-2)
 def test_kmeans_plus_plus_batch_too_small(self):
     # Requests one cluster per sample (5) but feeds mini-batches of only 4
     # samples, and expects training with KMeans++ initialization to raise.
     samples = np.array(
         [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]],
         dtype=np.float32)
     estimator_cls = kmeans_lib.KMeansClustering
     estimator = estimator_cls(
         num_clusters=samples.shape[0],
         initial_clusters=estimator_cls.KMEANS_PLUS_PLUS_INIT,
         distance_metric=estimator_cls.SQUARED_EUCLIDEAN_DISTANCE,
         use_mini_batch=True,
         mini_batch_steps_per_iteration=100,
         random_seed=24,
         relative_tolerance=None)
     with self.assertRaisesOpError(AssertionError):
         undersized_input_fn = self.input_fn(batch_size=4,
                                             points=samples,
                                             randomize=False)
         estimator.train(input_fn=undersized_input_fn, steps=1)
 def test_kmeans_plus_plus_batch_just_right(self):
     # With a single sample, a single cluster and a batch size of 1, one
     # training step must yield a cluster center equal to that sample.
     samples = np.array([[1, 2]], dtype=np.float32)
     estimator_cls = kmeans_lib.KMeansClustering
     estimator = estimator_cls(
         num_clusters=samples.shape[0],
         initial_clusters=estimator_cls.KMEANS_PLUS_PLUS_INIT,
         distance_metric=estimator_cls.SQUARED_EUCLIDEAN_DISTANCE,
         use_mini_batch=True,
         mini_batch_steps_per_iteration=100,
         random_seed=24,
         relative_tolerance=None)
     exact_fit_input_fn = self.input_fn(batch_size=1,
                                        points=samples,
                                        randomize=False)
     estimator.train(input_fn=exact_fit_input_fn, steps=1)
     learned_centers = estimator.cluster_centers()
     self.assertAllEqual(samples, learned_centers)
 def _fit(self, num_iters=10):
     """Train and score a TensorFlow KMeans model ``num_iters`` times.

     Each run builds a new KMeans++-initialized estimator with its own
     random seed, trains it on ``self.points`` for 50 steps and records its
     score. The wall-clock start/end times and all scores are then handed
     to ``self._report``.

     Args:
       num_iters: Number of independent train/score runs. Defaults to 10.
     """
     scores = []
     start = time.time()

     def _all_points_input_fn():
         # Feeds the whole data set as a single constant tensor.
         return constant_op.constant(self.points), None

     for run_index in range(num_iters):
         print('Starting tensorflow KMeans: %d' % run_index)
         num_retries = int(math.log(self.num_clusters) + 2)
         tf_kmeans = kmeans_lib.KMeansClustering(
             self.num_clusters,
             initial_clusters=(
                 kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT),
             kmeans_plus_plus_num_retries=num_retries,
             random_seed=run_index * 42,
             relative_tolerance=1e-6,
             config=self.config(3))
         tf_kmeans.train(input_fn=_all_points_input_fn, steps=50)
         # The centers are fetched but deliberately not used; presumably the
         # fetch is meant to be part of the timed work.
         tf_kmeans.cluster_centers()
         run_score = tf_kmeans.score(input_fn=_all_points_input_fn)
         scores.append(run_score)
     self._report(num_iters, start, time.time(), scores)
    def test_monitor(self):
        # Trains without a step limit so that only the relative-tolerance
        # stopping criterion ends training, then checks that the final score
        # is within 1% of the expected score.
        if self.use_mini_batch:
            # Skipped for mini-batch runs: the loss value can be noisy there.
            return

        estimator_cls = kmeans_lib.KMeansClustering
        kmeans = estimator_cls(
            self.num_centers,
            initial_clusters=self.initial_clusters,
            distance_metric=estimator_cls.SQUARED_EUCLIDEAN_DISTANCE,
            use_mini_batch=self.use_mini_batch,
            mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration,
            config=self.config(14),
            random_seed=12,
            relative_tolerance=1e-4)

        # steps=None forces training to continue until the relative tolerance
        # monitor stops it.
        train_input_fn = self.input_fn()
        kmeans.train(input_fn=train_input_fn, steps=None)

        # Score over all points in a single batch.
        full_batch_input_fn = self.input_fn(batch_size=self.num_points)
        score = kmeans.score(input_fn=full_batch_input_fn)
        allowed_error = self.true_score * 0.01
        self.assertNear(self.true_score, score, allowed_error)
 def test_queues(self):
     # Smoke test: a 5-cluster estimator with otherwise default settings
     # must complete one training step on the fixture's default input_fn.
     estimator = kmeans_lib.KMeansClustering(5)
     default_input_fn = self.input_fn()
     estimator.train(input_fn=default_input_fn, steps=1)