def setUp(self): self.points = np.array([[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2], [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32) self.num_points = self.points.shape[0] self.true_centers = np.array([ normalize( np.mean(normalize(self.points)[0:4, :], axis=0, keepdims=True))[0], normalize( np.mean(normalize(self.points)[4:, :], axis=0, keepdims=True))[0] ], dtype=np.float32) self.true_assignments = np.array([0] * 4 + [1] * 4) self.true_score = len(self.points) - np.tensordot( normalize(self.points), self.true_centers[self.true_assignments]) self.num_centers = 2 self.kmeans = kmeans_lib.KMeansClustering( self.num_centers, initial_clusters=kmeans_lib.KMeansClustering.RANDOM_INIT, distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE, use_mini_batch=self.use_mini_batch, mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, config=self.config(3))
def _kmeans(self, relative_tolerance=None): return kmeans_lib.KMeansClustering( self.num_centers, initial_clusters=self.initial_clusters, distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, use_mini_batch=self.use_mini_batch, mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, random_seed=24, relative_tolerance=relative_tolerance)
def test_kmeans_plus_plus_batch_too_small(self): points = np.array( [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]], dtype=np.float32) kmeans = kmeans_lib.KMeansClustering( num_clusters=points.shape[0], initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, use_mini_batch=True, mini_batch_steps_per_iteration=100, random_seed=24, relative_tolerance=None) with self.assertRaisesOpError(AssertionError): kmeans.train( input_fn=self.input_fn(batch_size=4, points=points, randomize=False), steps=1)
def test_kmeans_plus_plus_batch_just_right(self): points = np.array([[1, 2]], dtype=np.float32) kmeans = kmeans_lib.KMeansClustering( num_clusters=points.shape[0], initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, use_mini_batch=True, mini_batch_steps_per_iteration=100, random_seed=24, relative_tolerance=None) kmeans.train( input_fn=self.input_fn(batch_size=1, points=points, randomize=False), steps=1) clusters = kmeans.cluster_centers() self.assertAllEqual(points, clusters)
def test_predict_kmeans_plus_plus(self): # Most points are concetrated near one center. KMeans++ is likely to find # the less populated centers. points = np.array([[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1], [-3., -3.1], [-3., -3.1], [-3.2, -3.], [-3., -3.]], dtype=np.float32) true_centers = np.array([ normalize(np.mean(normalize(points)[0:2, :], axis=0, keepdims=True))[0], normalize(np.mean(normalize(points)[2:4, :], axis=0, keepdims=True))[0], normalize(np.mean(normalize(points)[4:, :], axis=0, keepdims=True))[0] ], dtype=np.float32) true_assignments = [0] * 2 + [1] * 2 + [2] * 8 true_score = len(points) - np.tensordot(normalize(points), true_centers[true_assignments]) kmeans = kmeans_lib.KMeansClustering( 3, initial_clusters=self.initial_clusters, distance_metric=kmeans_lib.KMeansClustering.COSINE_DISTANCE, use_mini_batch=self.use_mini_batch, mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, config=self.config(3)) kmeans.train(input_fn=lambda: (constant_op.constant(points), None), steps=30) centers = normalize(kmeans.cluster_centers()) self.assertAllClose(sorted(centers.tolist()), sorted(true_centers.tolist()), atol=1e-2) def _input_fn(): return (input_lib.limit_epochs(constant_op.constant(points), num_epochs=1), None) assignments = list(kmeans.predict_cluster_index(input_fn=_input_fn)) self.assertAllClose(centers[assignments], true_centers[true_assignments], atol=1e-2) score = kmeans.score( input_fn=lambda: (constant_op.constant(points), None)) self.assertAllClose(score, true_score, atol=1e-2)
def _fit(self, num_iters=10): scores = [] start = time.time() for i in range(num_iters): print('Starting tensorflow KMeans: %d' % i) tf_kmeans = kmeans_lib.KMeansClustering( self.num_clusters, initial_clusters=kmeans_lib.KMeansClustering.KMEANS_PLUS_PLUS_INIT, kmeans_plus_plus_num_retries=int(math.log(self.num_clusters) + 2), random_seed=i * 42, relative_tolerance=1e-6, config=self.config(3)) tf_kmeans.train( input_fn=lambda: (constant_op.constant(self.points), None), steps=50) _ = tf_kmeans.cluster_centers() scores.append( tf_kmeans.score( input_fn=lambda: (constant_op.constant(self.points), None))) self._report(num_iters, start, time.time(), scores)
def test_monitor(self): if self.use_mini_batch: # We don't test for use_mini_batch case since the loss value can be noisy. return kmeans = kmeans_lib.KMeansClustering( self.num_centers, initial_clusters=self.initial_clusters, distance_metric=kmeans_lib.KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE, use_mini_batch=self.use_mini_batch, mini_batch_steps_per_iteration=self.mini_batch_steps_per_iteration, config=self.config(14), random_seed=12, relative_tolerance=1e-4) kmeans.train( input_fn=self.input_fn(), # Force it to train until the relative tolerance monitor stops it. steps=None) score = kmeans.score(input_fn=self.input_fn(batch_size=self.num_points)) self.assertNear(self.true_score, score, self.true_score * 0.01)
def test_queues(self): kmeans = kmeans_lib.KMeansClustering(5) kmeans.train(input_fn=self.input_fn(), steps=1)
import numpy as np import tensorflow as tf import matplotlib import matplotlib.pyplot as plt from tensorflow.contrib.factorization.python.ops import kmeans def input_fn_1D(input_1D_): input_t = tf.convert_to_tensor(input_1D_, dtype=tf.float32) input_t = tf.expand_dims(input_t, 1) return(input_t, None) input_1D = np.array([1,2,3.0,4,5,126,21,33,6,73.0,2,3,56,98,100,4,8,33,102]) k_means_estimator = kmeans.KMeansClustering(num_clusters=2) fit = k_means_estimator.train(input_fn=lambda: input_fn_1D(input_1D), steps=1000) clusters_1D = k_means_estimator.cluster_centers() fig = plt.figure() ax1 = fig.add_subplot(111) ax1.scatter(input_1D, np.zeros_like(input_1D), s=300, marker='o') ax1.scatter(clusters_1D, np.zeros_like(clusters_1D), c='r', s=200, marker='s') plt.show() for var in fit.get_variable_names(): print(var, "-----> ", fit.get_variable_value(var))