Example #1
  def test_transform_with_cosine_distance(self):
    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]])

    true_centers = [normalize(np.mean(normalize(points)[4:, :], axis=0,
                                      keepdims=True))[0],
                    normalize(np.mean(normalize(points)[0:4, :], axis=0,
                                      keepdims=True))[0]]

    kmeans = KMeans(2,
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
                    batch_size=8,
                    continue_training=True,
                    config=run_config.RunConfig(tf_random_seed=3))
    kmeans.fit(x=points, steps=30)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0),
                        atol=1e-2)

    true_transform = 1 - cosine_similarity(points, centers)
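    # transform() should return, for every point, its distance to each cluster
    # center; with COSINE_DISTANCE that is 1 - cosine similarity, which is what
    # the reference value above computes.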
    transform = kmeans.transform(points)
    self.assertAllClose(transform, true_transform, atol=1e-3)
Example #2
    def test_transform_with_cosine_distance(self):
        points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18], [-2.5, -3.5],
                           [-2, -8], [-3, -1], [-3, -18]])

        true_centers = [
            normalize(np.mean(normalize(points)[4:, :], axis=0,
                              keepdims=True))[0],
            normalize(np.mean(normalize(points)[0:4, :], axis=0,
                              keepdims=True))[0]
        ]

        kmeans = KMeans(2,
                        initial_clusters=kmeans_ops.RANDOM_INIT,
                        distance_metric=kmeans_ops.COSINE_DISTANCE,
                        use_mini_batch=self.use_mini_batch,
                        config=self.config(3))
        kmeans.fit(x=points, steps=30, batch_size=8)

        centers = normalize(kmeans.clusters())
        self.assertAllClose(np.sort(centers, axis=0),
                            np.sort(true_centers, axis=0),
                            atol=1e-2)

        true_transform = 1 - cosine_similarity(points, centers)
        transform = kmeans.transform(points, batch_size=8)
        self.assertAllClose(transform, true_transform, atol=1e-3)
Example #3
  def test_transform_with_cosine_distance(self):
    points = np.array(
        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2],
         [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32)

    true_centers = [normalize(np.mean(normalize(points)[4:, :], axis=0,
                                      keepdims=True))[0],
                    normalize(np.mean(normalize(points)[0:4, :], axis=0,
                                      keepdims=True))[0]]

    kmeans = KMeans(2,
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
                    config=self.config(5))
    kmeans.fit(x=points, steps=50, batch_size=8)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0),
                        atol=1e-2)

    true_transform = 1 - cosine_similarity(points, centers)
    transform = kmeans.transform(points, batch_size=8)
    self.assertAllClose(transform, true_transform, atol=1e-3)
Example #4
  def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
      self):
    points = np.array([[2.0, 3.0], [1.6, 8.2]])

    with self.assertRaisesOpError(AssertionError):
      kmeans = KMeans(num_clusters=3,
                      initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
      kmeans.fit(x=points, steps=10, batch_size=8)
Example #5
    def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(
            self):
        points = np.array([[2.0, 3.0], [1.6, 8.2]])

        with self.assertRaisesOpError('less'):
            kmeans = KMeans(num_clusters=3,
                            initial_clusters=kmeans_ops.RANDOM_INIT)
            kmeans.fit(x=points, steps=10, batch_size=8)
Example #6
    def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
            self):
        points = np.array([[2.0, 3.0], [1.6, 8.2]])

        with self.assertRaisesOpError(AssertionError):
            kmeans = KMeans(num_clusters=3,
                            initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
            kmeans.fit(x=points, steps=10, batch_size=8)
Example #7
 def test_fit_with_cosine_distance(self):
   # Create points on y=x and y=1.5x lines to check the cosine similarity.
   # Note that euclidean distance will give different results in this case.
   points = np.array([[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]])
   # true centers are the unit vectors on lines y=x and y=1.5x
   true_centers = np.array([[0.70710678, 0.70710678], [0.5547002, 0.83205029]])
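   # [0.70710678, 0.70710678] is [1, 1] / sqrt(2) and [0.5547002, 0.83205029] is
   # [1, 1.5] / sqrt(1 + 1.5**2), i.e. the unit vectors along the two lines.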
   kmeans = KMeans(2,
                   initial_clusters=kmeans_ops.RANDOM_INIT,
                   distance_metric=kmeans_ops.COSINE_DISTANCE,
                   use_mini_batch=self.use_mini_batch,
                   config=self.config(2),
                   random_seed=12)
   kmeans.fit(x=points, steps=10, batch_size=4)
   centers = normalize(kmeans.clusters())
   self.assertAllClose(np.sort(centers, axis=0),
                       np.sort(true_centers, axis=0))
Example #8
 def _fit(self, num_iters=10):
   scores = []
   start = time.time()
   for i in range(num_iters):
     print('Starting tensorflow KMeans: %d' % i)
     tf_kmeans = KMeans(self.num_clusters,
                        initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                        kmeans_plus_plus_num_retries=int(
                            math.log(self.num_clusters) + 2),
                        random_seed=i * 42,
                        config=run_config.RunConfig(tf_random_seed=3))
     tf_kmeans.fit(x=self.points, batch_size=self.num_points, steps=50,
                   relative_tolerance=1e-6)
     _ = tf_kmeans.clusters()
     scores.append(tf_kmeans.score(self.points))
   self._report(num_iters, start, time.time(), scores)
Example #9
 def test_fit_with_cosine_distance(self):
     # Create points on y=x and y=1.5x lines to check the cosine similarity.
     # Note that euclidean distance will give different results in this case.
     points = np.array([[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]])
     # true centers are the unit vectors on lines y=x and y=1.5x
     true_centers = np.array([[0.70710678, 0.70710678],
                              [0.5547002, 0.83205029]])
     kmeans = KMeans(2,
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
                     config=self.config(2),
                     random_seed=12)
     kmeans.fit(x=points, steps=10, batch_size=4)
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
                         np.sort(true_centers, axis=0))
Example #10
  def test_monitor(self):
    if self.batch_size != self.num_points:
      # TODO(agarwal): Doesn't work with mini-batch.
      return
    kmeans = KMeans(self.num_centers,
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    use_mini_batch=self.use_mini_batch,
                    config=run_config.RunConfig(tf_random_seed=14),
                    random_seed=12)

    kmeans.fit(x=self.points,
               # Force it to train forever until the monitor stops it.
               steps=None,
               batch_size=self.batch_size,
               relative_tolerance=1e-4)
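    # With steps=None, training only stops once the relative change in loss
    # falls below relative_tolerance (the "monitor" referred to above).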
    score = kmeans.score(x=self.points)
    self.assertNear(self.true_score, score, self.true_score * 0.005)
Example #11
    def setUp(self):
        np.random.seed(3)
        tf.set_random_seed(2)
        self.num_centers = 2
        self.num_dims = 2
        self.num_points = 4000
        self.batch_size = 100
        self.true_centers = self.make_random_centers(self.num_centers,
                                                     self.num_dims)
        self.points, self.assignments, self.scores = self.make_random_points(
            self.true_centers, self.num_points)
        self.true_score = np.add.reduce(self.scores)

        # Use initial means from kmeans (just like scikit-learn does).
        clusterer = KMeans(num_clusters=self.num_centers)
        clusterer.fit(self.points, steps=30)
        self.initial_means = clusterer.clusters()
Example #12
  def setUp(self):
    np.random.seed(3)
    tf.set_random_seed(2)
    self.num_centers = 2
    self.num_dims = 2
    self.num_points = 4000
    self.batch_size = 100
    self.true_centers = self.make_random_centers(self.num_centers,
                                                 self.num_dims)
    self.points, self.assignments, self.scores = self.make_random_points(
        self.true_centers,
        self.num_points)
    self.true_score = np.add.reduce(self.scores)

    # Use initial means from kmeans (just like scikit-learn does).
    clusterer = KMeans(num_clusters=self.num_centers)
    clusterer.fit(self.points, steps=30)
    self.initial_means = clusterer.clusters()
Example #13
  def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
    # Most points are concentrated near one center. KMeans++ is likely to find
    # the less populated centers.
    points = np.array([[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3],
                       [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1], [-3., -3.1],
                       [-3., -3.1], [-3.2, -3.], [-3., -3.]]).astype(np.float32)
    true_centers = np.array(
        [normalize(np.mean(normalize(points)[0:2, :], axis=0,
                           keepdims=True))[0],
         normalize(np.mean(normalize(points)[2:4, :], axis=0,
                           keepdims=True))[0],
         normalize(np.mean(normalize(points)[4:, :], axis=0,
                           keepdims=True))[0]])
    true_assignments = [0] * 2 + [1] * 2 + [2] * 8
    true_score = len(points) - np.tensordot(normalize(points),
                                            true_centers[true_assignments])
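    # Each point contributes (1 - cosine similarity to its assigned center), so
    # the expected score is n minus the sum of those similarities; np.tensordot
    # with its default axes=2 computes that elementwise-product sum.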

    kmeans = KMeans(3,
                    initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
                    config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=12)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(sorted(centers.tolist()),
                        sorted(true_centers.tolist()),
                        atol=1e-2)

    assignments = kmeans.predict(points, batch_size=12)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments], atol=1e-2)

    score = kmeans.score(points, batch_size=12)
    self.assertAllClose(score, true_score, atol=1e-2)
Example #14
  def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
    # Most points are concentrated near one center. KMeans++ is likely to find
    # the less populated centers.
    points = np.array([[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3],
                       [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1], [-3., -3.1],
                       [-3., -3.1], [-3.2, -3.], [-3., -3.]], dtype=np.float32)
    true_centers = np.array(
        [normalize(np.mean(normalize(points)[0:2, :], axis=0,
                           keepdims=True))[0],
         normalize(np.mean(normalize(points)[2:4, :], axis=0,
                           keepdims=True))[0],
         normalize(np.mean(normalize(points)[4:, :], axis=0,
                           keepdims=True))[0]], dtype=np.float32)
    true_assignments = [0] * 2 + [1] * 2 + [2] * 8
    true_score = len(points) - np.tensordot(normalize(points),
                                            true_centers[true_assignments])

    kmeans = KMeans(3,
                    initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
                    config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=12)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(sorted(centers.tolist()),
                        sorted(true_centers.tolist()),
                        atol=1e-2)

    assignments = kmeans.predict(points, batch_size=12)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments], atol=1e-2)

    score = kmeans.score(points, batch_size=12)
    self.assertAllClose(score, true_score, atol=1e-2)
Example #15
    def test_predict_with_cosine_distance(self):
        points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18], [-2.5, -3.5],
                           [-2, -8], [-3, -1], [-3, -18]]).astype(np.float32)
        true_centers = np.array([
            normalize(np.mean(normalize(points)[0:4, :], axis=0,
                              keepdims=True))[0],
            normalize(np.mean(normalize(points)[4:, :], axis=0,
                              keepdims=True))[0]
        ])
        true_assignments = [0] * 4 + [1] * 4
        true_score = len(points) - np.tensordot(normalize(points),
                                                true_centers[true_assignments])

        kmeans = KMeans(2,
                        initial_clusters=kmeans_ops.RANDOM_INIT,
                        distance_metric=kmeans_ops.COSINE_DISTANCE,
                        use_mini_batch=self.use_mini_batch,
                        config=self.config(3))
        kmeans.fit(x=points, steps=30, batch_size=8)

        centers = normalize(kmeans.clusters())
        self.assertAllClose(np.sort(centers, axis=0),
                            np.sort(true_centers, axis=0),
                            atol=1e-2)

        assignments = kmeans.predict(points, batch_size=8)
        self.assertAllClose(centers[assignments],
                            true_centers[true_assignments],
                            atol=1e-2)

        score = kmeans.score(points, batch_size=8)
        self.assertAllClose(score, true_score, atol=1e-2)
Example #16
  def test_predict_with_cosine_distance(self):
    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]]).astype(
                           np.float32)
    true_centers = np.array(
        [normalize(np.mean(normalize(points)[0:4, :],
                           axis=0,
                           keepdims=True))[0],
         normalize(np.mean(normalize(points)[4:, :],
                           axis=0,
                           keepdims=True))[0]])
    true_assignments = [0] * 4 + [1] * 4
    true_score = len(points) - np.tensordot(normalize(points),
                                            true_centers[true_assignments])

    kmeans = KMeans(2,
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
                    config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=8)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0), atol=1e-2)

    assignments = kmeans.predict(points, batch_size=8)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments], atol=1e-2)

    score = kmeans.score(points, batch_size=8)
    self.assertAllClose(score, true_score, atol=1e-2)
Example #17
# The two functions above use NumPy to build a dataset that is well suited to clustering.
# We generate 10,000 two-dimensional points around 6 random cluster centers.
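# make_random_centers and make_random_points are not defined on this page; a
# minimal sketch of what they might look like (hypothetical implementations,
# assuming NumPy is available):
import numpy as np

def make_random_centers(num_centers, num_dims):
    # Draw cluster centers uniformly inside a large box.
    return np.round(np.random.rand(num_centers, num_dims).astype(np.float32) * 500)

def make_random_points(centers, num_points):
    # Scatter points with Gaussian noise around randomly chosen centers and
    # return the points, their true assignments, and per-point squared offsets.
    num_centers, num_dims = centers.shape
    assignments = np.random.choice(num_centers, num_points)
    offsets = np.round(
        np.random.randn(num_points, num_dims).astype(np.float32) * 20)
    points = centers[assignments] + offsets
    scores = np.add.reduce(offsets * offsets, axis=1)
    return points, assignments, scores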
num_centers = 6
num_dims = 2
num_points = 10000
true_centers = make_random_centers(num_centers, num_dims)
points, _, scores = make_random_points(true_centers, num_points)


from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_ops
from tensorflow.contrib.factorization.python.ops.kmeans import \
    KMeansClustering as KMeans
from tensorflow.contrib.learn.python.learn.estimators.run_config import RunConfig

kmeans = KMeans(num_clusters=num_centers,
                initial_clusters=kmeans_ops.RANDOM_INIT,
                use_mini_batch=False,
                config=RunConfig(tf_random_seed=14),
                random_seed=12)
kmeans.fit(x=points, steps=10, batch_size=8)
clusters = kmeans.clusters()

kmeans.predict(points, batch_size=128)
kmeans.score(points, batch_size=128)
kmeans.transform(points, batch_size=128)


####################################
# Support Vector Machine
def input_fn():
    return {
        'example_id': tf.constant(['1', '2', '3']),
Example #18
  def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
    points = np.array([[2.0, 3.0], [1.6, 8.2]])

    with self.assertRaisesOpError('less'):
      kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
      kmeans.fit(x=points, steps=10, batch_size=8)