def test_transform_with_cosine_distance(self):
    """Check `transform` against cosine distances to the fitted centers.

    Eight float32 points are split into two groups of four. After fitting a
    two-cluster model with the cosine distance metric, the normalized cluster
    centers are compared (after a column-wise sort) with the expected centers,
    and the output of `transform` is compared with
    `1 - cosine_similarity(points, centers)`.
    """
    def expected_center(rows):
        # normalize() of the mean of the normalize()d points picked by `rows`.
        group_mean = np.mean(normalize(points)[rows, :], axis=0,
                             keepdims=True)
        return normalize(group_mean)[0]

    points = np.array(
        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2],
         [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]],
        dtype=np.float32)
    # Second group first, matching the order the expectation is built in.
    true_centers = [expected_center(slice(4, None)),
                    expected_center(slice(0, 4))]

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(5))
    kmeans.fit(x=points, steps=50, batch_size=8)

    centers = normalize(kmeans.clusters())
    # Column-wise sorting makes the comparison independent of cluster order.
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0),
                        atol=1e-2)

    expected_distances = 1 - cosine_similarity(points, centers)
    actual_distances = kmeans.transform(points, batch_size=8)
    self.assertAllClose(actual_distances, expected_distances, atol=1e-3)
def test_transform_with_cosine_distance(self):
    """Check `transform` against cosine distances to the fitted centers.

    The eight points form two groups of four that are mirror images of each
    other through the origin. The model is built with `batch_size=8` and
    `continue_training=True`, fitted for 30 steps, and then its normalized
    centers and its `transform` output are compared with the expected values.
    """
    def expected_center(rows):
        # normalize() of the mean of the normalize()d points picked by `rows`.
        group_mean = np.mean(normalize(points)[rows, :], axis=0,
                             keepdims=True)
        return normalize(group_mean)[0]

    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]])
    true_centers = [expected_center(slice(4, None)),
                    expected_center(slice(0, 4))]

    estimator_config = run_config.RunConfig(tf_random_seed=3)
    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        batch_size=8,
        continue_training=True,
        config=estimator_config)
    kmeans.fit(x=points, steps=30)

    centers = normalize(kmeans.clusters())
    # Column-wise sorting makes the comparison independent of cluster order.
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0),
                        atol=1e-2)

    expected_distances = 1 - cosine_similarity(points, centers)
    actual_distances = kmeans.transform(points)
    self.assertAllClose(actual_distances, expected_distances, atol=1e-3)
def test_transform_with_cosine_distance(self):
    """Compare `transform` output with `1 - cosine_similarity`.

    Fits a two-cluster cosine-distance model (constructed with `batch_size=8`
    and `continue_training=True`) on eight points, asserts the normalized
    centers match the expected ones within 1e-2, and asserts the transformed
    points match the expected cosine distances within 1e-3.
    """
    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                       [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]])

    # Expected center per group: normalize() of the mean of the group's
    # normalize()d points. The last four rows are listed first.
    group_rows = (slice(4, None), slice(0, 4))
    true_centers = [
        normalize(
            np.mean(normalize(points)[rows, :], axis=0, keepdims=True))[0]
        for rows in group_rows
    ]

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        batch_size=8,
        continue_training=True,
        config=run_config.RunConfig(tf_random_seed=3))
    kmeans.fit(x=points, steps=30)

    centers = normalize(kmeans.clusters())
    sorted_found = np.sort(centers, axis=0)
    sorted_expected = np.sort(true_centers, axis=0)
    self.assertAllClose(sorted_found, sorted_expected, atol=1e-2)

    true_transform = 1 - cosine_similarity(points, centers)
    self.assertAllClose(kmeans.transform(points), true_transform, atol=1e-3)
def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
    """Check centers, `predict` and `score` with KMeans++ initialization.

    Twelve float32 points are grouped as 2 + 2 + 8 rows. A three-cluster
    cosine-distance model is fitted and the test asserts, within 1e-2, that
    the normalized centers, the centers selected by the predicted
    assignments, and the score all match the expected values.
    """
    def expected_center(rows):
        # normalize() of the mean of the normalize()d points picked by `rows`.
        group_mean = np.mean(normalize(points)[rows, :], axis=0,
                             keepdims=True)
        return normalize(group_mean)[0]

    # Most points are concentrated near one center. KMeans++ is likely to
    # find the less populated centers.
    coordinates = [[2.5, 3.5], [2.5, 3.5],
                   [-2, 3], [-2, 3],
                   [-3, -3], [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1],
                   [-3., -3.1], [-3., -3.1], [-3.2, -3.], [-3., -3.]]
    points = np.array(coordinates).astype(np.float32)

    true_centers = np.array([expected_center(slice(0, 2)),
                             expected_center(slice(2, 4)),
                             expected_center(slice(4, None))])
    true_assignments = [0] * 2 + [1] * 2 + [2] * 8
    matched_centers = true_centers[true_assignments]
    true_score = len(points) - np.tensordot(normalize(points),
                                            matched_centers)

    kmeans = KMeans(
        3,
        initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=12)

    centers = normalize(kmeans.clusters())
    # Sorting the row lists makes the comparison independent of cluster order.
    self.assertAllClose(sorted(centers.tolist()),
                        sorted(true_centers.tolist()),
                        atol=1e-2)

    assignments = kmeans.predict(points, batch_size=12)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments],
                        atol=1e-2)

    score = kmeans.score(points, batch_size=12)
    self.assertAllClose(score, true_score, atol=1e-2)
def test_transform_with_cosine_distance(self):
    """Compare `transform` output with `1 - cosine_similarity`.

    Uses eight float32 points in two groups of four, fits a two-cluster
    cosine-distance model for 50 steps with a batch size of 8, and asserts
    that both the normalized centers (tolerance 1e-2) and the transformed
    points (tolerance 1e-3) match the expected values.
    """
    points = np.array(
        [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2],
         [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]],
        dtype=np.float32)

    # Expected center per group: normalize() of the mean of the group's
    # normalize()d points. The last four rows are listed first.
    group_rows = (slice(4, None), slice(0, 4))
    true_centers = [
        normalize(
            np.mean(normalize(points)[rows, :], axis=0, keepdims=True))[0]
        for rows in group_rows
    ]

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(5))
    kmeans.fit(x=points, steps=50, batch_size=8)

    centers = normalize(kmeans.clusters())
    sorted_found = np.sort(centers, axis=0)
    sorted_expected = np.sort(true_centers, axis=0)
    self.assertAllClose(sorted_found, sorted_expected, atol=1e-2)

    true_transform = 1 - cosine_similarity(points, centers)
    self.assertAllClose(kmeans.transform(points, batch_size=8),
                        true_transform,
                        atol=1e-3)
def test_predict_with_cosine_distance(self):
    """Check centers, `predict` and `score` under the cosine distance metric.

    The eight float32 points form two groups of four that mirror each other
    through the origin. After fitting a two-cluster model, the normalized
    centers, the centers selected by the predicted assignments, and the score
    are each compared with their expected values within 1e-2.
    """
    def expected_center(rows):
        # normalize() of the mean of the normalize()d points picked by `rows`.
        group_mean = np.mean(normalize(points)[rows, :], axis=0,
                             keepdims=True)
        return normalize(group_mean)[0]

    coordinates = [[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                   [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]]
    points = np.array(coordinates).astype(np.float32)

    true_centers = np.array([expected_center(slice(0, 4)),
                             expected_center(slice(4, None))])
    true_assignments = [0] * 4 + [1] * 4
    matched_centers = true_centers[true_assignments]
    true_score = len(points) - np.tensordot(normalize(points),
                                            matched_centers)

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=8)

    centers = normalize(kmeans.clusters())
    # Column-wise sorting makes the comparison independent of cluster order.
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0),
                        atol=1e-2)

    assignments = kmeans.predict(points, batch_size=8)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments],
                        atol=1e-2)

    score = kmeans.score(points, batch_size=8)
    self.assertAllClose(score, true_score, atol=1e-2)
def test_predict_with_cosine_distance(self):
    """Verify fitted centers, assignments and score for cosine distance.

    Fits a two-cluster model on eight float32 points (two groups of four) and
    asserts, each within 1e-2, that the normalized centers, the centers
    indexed by the predicted assignments, and the reported score agree with
    values computed directly from the data.
    """
    points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18],
                       [-2.5, -3.5], [-2, -8], [-3, -1],
                       [-3, -18]]).astype(np.float32)

    # Expected center per group: normalize() of the mean of the group's
    # normalize()d points.
    group_rows = (slice(0, 4), slice(4, None))
    true_centers = np.array([
        normalize(
            np.mean(normalize(points)[rows, :], axis=0, keepdims=True))[0]
        for rows in group_rows
    ])
    true_assignments = [0] * 4 + [1] * 4
    true_score = len(points) - np.tensordot(
        normalize(points), true_centers[true_assignments])

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(3))
    kmeans.fit(x=points, steps=30, batch_size=8)

    centers = normalize(kmeans.clusters())
    sorted_found = np.sort(centers, axis=0)
    sorted_expected = np.sort(true_centers, axis=0)
    self.assertAllClose(sorted_found, sorted_expected, atol=1e-2)

    assignments = kmeans.predict(points, batch_size=8)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments],
                        atol=1e-2)

    self.assertAllClose(kmeans.score(points, batch_size=8),
                        true_score,
                        atol=1e-2)
def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
        self):
    """Expect an op error when asking for 3 clusters from only 2 points.

    Uses KMeans++ initialization; both the construction of the estimator and
    the call to `fit` happen inside the `assertRaisesOpError` context.
    """
    two_points = np.array([[2.0, 3.0], [1.6, 8.2]])
    requested_clusters = 3

    # NOTE(review): an exception class, not a message pattern, is handed to
    # assertRaisesOpError here -- confirm this is the intended matcher.
    with self.assertRaisesOpError(AssertionError):
        KMeans(
            num_clusters=requested_clusters,
            initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
        ).fit(x=two_points, steps=10, batch_size=8)
def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
    """Expect an op error matching 'less' for 3 clusters and only 2 points.

    Uses random initialization; both the construction of the estimator and
    the call to `fit` happen inside the `assertRaisesOpError` context.
    """
    two_points = np.array([[2.0, 3.0], [1.6, 8.2]])
    requested_clusters = 3

    with self.assertRaisesOpError('less'):
        KMeans(
            num_clusters=requested_clusters,
            initial_clusters=kmeans_ops.RANDOM_INIT,
        ).fit(x=two_points, steps=10, batch_size=8)
def test_fit_with_cosine_distance(self):
    """Fit with cosine distance and compare centers with two unit vectors.

    The four points lie on the lines y = x and y = 1.5x. After fitting, the
    normalized cluster centers must match the unit vectors along those lines
    (compared after a column-wise sort, using the default tolerances of
    `assertAllClose`).
    """
    # Two points on y = x and two on y = 1.5x. Euclidean distance would give
    # different results in this case, so this exercises cosine similarity.
    on_diagonal = [[9, 9], [0.5, 0.5]]
    on_steeper_line = [[10, 15], [0.4, 0.6]]
    points = np.array(
        [on_diagonal[0], on_diagonal[1],
         on_steeper_line[0], on_steeper_line[1]])

    # Unit vectors along y = x and y = 1.5x respectively.
    true_centers = np.array([[0.70710678, 0.70710678],
                             [0.5547002, 0.83205029]])

    kmeans = KMeans(
        2,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        config=self.config(2),
        random_seed=12)
    kmeans.fit(x=points, steps=10, batch_size=4)

    centers = normalize(kmeans.clusters())
    sorted_found = np.sort(centers, axis=0)
    sorted_expected = np.sort(true_centers, axis=0)
    self.assertAllClose(sorted_found, sorted_expected)
def _fit(self, num_iters=10):
    """Fit a TensorFlow KMeans model `num_iters` times and report the scores.

    Each iteration builds a fresh estimator with KMeans++ initialization and
    a per-iteration `random_seed` of `i * 42`, fits it on `self.points`, and
    records its score. Start time, end time and all scores are passed to
    `self._report`.

    Args:
      num_iters: number of independent fit/score rounds to run.
    """
    collected_scores = []
    started_at = time.time()

    for run_index in range(num_iters):
        print('Starting tensorflow KMeans: %d' % run_index)

        retries = int(math.log(self.num_clusters) + 2)
        estimator = KMeans(
            self.num_clusters,
            initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
            kmeans_plus_plus_num_retries=retries,
            random_seed=run_index * 42,
            config=run_config.RunConfig(tf_random_seed=3))
        estimator.fit(
            x=self.points,
            batch_size=self.num_points,
            steps=50,
            relative_tolerance=1e-6)

        # The cluster centers are fetched but deliberately not used.
        _ = estimator.clusters()
        collected_scores.append(estimator.score(self.points))

    self._report(num_iters, started_at, time.time(), collected_scores)
def setUp(self):
    """Seed the RNGs and build the synthetic dataset shared by the tests.

    Sets the problem dimensions, generates random true centers and points
    around them, stores the summed per-point scores as `self.true_score`,
    and fits a default KMeans model for 30 steps to obtain
    `self.initial_means`.
    """
    # Fix both NumPy's and TensorFlow's seeds before any data is generated.
    np.random.seed(3)
    tf.set_random_seed(2)

    self.num_centers, self.num_dims = 2, 2
    self.num_points = 4000
    self.batch_size = 100

    self.true_centers = self.make_random_centers(self.num_centers,
                                                 self.num_dims)
    generated = self.make_random_points(self.true_centers, self.num_points)
    self.points, self.assignments, self.scores = generated
    self.true_score = np.add.reduce(self.scores)

    # Use initial means from kmeans (just like scikit-learn does).
    bootstrap_model = KMeans(num_clusters=self.num_centers)
    bootstrap_model.fit(self.points, steps=30)
    self.initial_means = bootstrap_model.clusters()
def test_monitor(self):
    """Fit with `steps=None` and check the final score against the truth.

    The test body is skipped (plain return) unless the batch size equals the
    number of points. Otherwise the model is fitted with no step limit and a
    `relative_tolerance` of 1e-4, and its score on `self.points` must lie
    within 0.5% of `self.true_score`.
    """
    full_batch = self.batch_size == self.num_points
    if not full_batch:
        # TODO(agarwal): Doesn't work with mini-batch.
        return

    estimator_config = run_config.RunConfig(tf_random_seed=14)
    kmeans = KMeans(
        self.num_centers,
        initial_clusters=kmeans_ops.RANDOM_INIT,
        use_mini_batch=self.use_mini_batch,
        config=estimator_config,
        random_seed=12)

    # steps=None: force it to train forever until the monitor stops it.
    kmeans.fit(
        x=self.points,
        steps=None,
        batch_size=self.batch_size,
        relative_tolerance=1e-4)

    score = kmeans.score(x=self.points)
    allowed_error = self.true_score * 0.005
    self.assertNear(self.true_score, score, allowed_error)
def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
    """Verify centers, assignments and score with KMeans++ initialization.

    Twelve float32 points are grouped as 2 + 2 + 8 rows. The three-cluster
    cosine-distance model is built with `batch_size=12` and
    `continue_training=True`, fitted for 30 steps, and its normalized
    centers, predicted assignments and score are compared with the expected
    values within 1e-2.
    """
    # Most points are concentrated near one center. KMeans++ is likely to
    # find the less populated centers.
    points = np.array([[2.5, 3.5], [2.5, 3.5],
                       [-2, 3], [-2, 3],
                       [-3, -3], [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1],
                       [-3., -3.1], [-3., -3.1], [-3.2, -3.],
                       [-3., -3.]]).astype(np.float32)

    # Expected center per group: normalize() of the mean of the group's
    # normalize()d points.
    group_rows = (slice(0, 2), slice(2, 4), slice(4, None))
    true_centers = np.array([
        normalize(
            np.mean(normalize(points)[rows, :], axis=0, keepdims=True))[0]
        for rows in group_rows
    ])
    true_assignments = [0] * 2 + [1] * 2 + [2] * 8
    true_score = len(points) - np.tensordot(
        normalize(points), true_centers[true_assignments])

    kmeans = KMeans(
        3,
        initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
        distance_metric=kmeans_ops.COSINE_DISTANCE,
        use_mini_batch=self.use_mini_batch,
        batch_size=12,
        continue_training=True,
        config=run_config.RunConfig(tf_random_seed=3))
    kmeans.fit(x=points, steps=30)

    centers = normalize(kmeans.clusters())
    # Sorting the row lists makes the comparison independent of cluster order.
    found_rows = sorted(centers.tolist())
    expected_rows = sorted(true_centers.tolist())
    self.assertAllClose(found_rows, expected_rows, atol=1e-2)

    assignments = kmeans.predict(points)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments],
                        atol=1e-2)

    self.assertAllClose(kmeans.score(points), true_score, atol=1e-2)
def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
    """Fitting 3 clusters on 2 points must raise an op error matching 'less'.

    Random initialization is used. The estimator is created and fitted inside
    the `assertRaisesOpError` context, so an error from either step counts.
    """
    too_few_points = np.array([[2.0, 3.0], [1.6, 8.2]])
    fit_arguments = dict(x=too_few_points, steps=10, batch_size=8)

    with self.assertRaisesOpError('less'):
        model = KMeans(num_clusters=3,
                       initial_clusters=kmeans_ops.RANDOM_INIT)
        model.fit(**fit_arguments)
# K-means demo: generate synthetic points around random centers, fit a
# full-batch KMeans estimator, then exercise predict / score / transform.
num_centers = 6
num_dims = 2
num_points = 10000
true_centers = make_random_centers(num_centers, num_dims)
points, _, scores = make_random_points(true_centers, num_points)

from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_ops
from tensorflow.contrib.factorization.python.ops.kmeans import \
    KMeansClustering as KMeans

# The estimator's keyword for the number of clusters is `num_clusters`
# (the original `num_centers=` keyword is not an argument of the estimator).
kmeans = KMeans(num_clusters=num_centers,
                initial_clusters=kmeans_ops.RANDOM_INIT,
                use_mini_batch=False,
                config=RunConfig(tf_random_seed=14),
                random_seed=12)
kmeans.fit(x=points, steps=10, batch_size=8)
clusters = kmeans.clusters()
kmeans.predict(points, batch_size=128)
kmeans.score(points, batch_size=128)
kmeans.transform(points, batch_size=128)

####################################
# Support vector machine


def input_fn():
    """Return a tiny in-memory dataset as a `(features, labels)` pair.

    Returns:
      A tuple whose first element is a dict with the keys 'example_id',
      'feature1' and 'feature2' (three examples each) and whose second
      element is a constant tensor holding one label row per example.
    """
    features = {
        'example_id': tf.constant(['1', '2', '3']),
        'feature1': tf.constant([[0.0], [1.0], [3.0]]),
        'feature2': tf.constant([[1.0], [-1.2], [1.0]]),
    }
    # The labels must be one nested list; passing [1], [0], [1] as separate
    # arguments would feed the last two to tf.constant's dtype and shape
    # positional parameters.
    labels = tf.constant([[1], [0], [1]])
    return features, labels