def test_assignment():
    """Check that KMeans.assignment() reassigns points and reports changes.

    Two scenarios are exercised on the same KMeans instance:
    1. The stored cluster assignments differ from the closest clusters, so
       assignment() must return a truthy "changed" flag.
    2. The stored cluster assignments already match, so it must return a
       falsy flag.
    In both scenarios the resulting assignments are verified.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # Test change - new closest clusters are [0,0,1] - see test_closest_cluster
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 1, 1]

    change = kmeans.assignment()

    # Equality (not identity) so that a numpy.bool_ return value is accepted.
    assert change == True
    assert numpy.array_equal([0, 0, 1], kmeans.cluster_assignments)
    # The per-cluster point lists are ragged ([[0, 1], [2]]). Compare them as
    # plain lists: numpy.array_equal cannot convert ragged input on
    # NumPy >= 1.24 and would return False unconditionally.
    assert [list(points) for points in kmeans.data_point_assignments] == [[0, 1], [2]]

    # Test no change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 0, 1]

    change = kmeans.assignment()

    assert change == False
    assert numpy.array_equal([0, 0, 1], kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments] == [[0, 1], [2]]
def test_assignment():
    """Check that KMeans.assignment() reassigns points and reports changes.

    First the stored cluster assignments are set to differ from the closest
    clusters (a change must be reported), then they are set to match (no
    change must be reported). The resulting assignments are verified both
    times.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    expected_cluster_assignments = [0, 0, 1]
    expected_data_point_assignments = [[0, 1], [2]]

    # Test change - new closest clusters are [0,0,1] - see test_closest_cluster
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 1, 1]

    change = kmeans.assignment()

    # Equality (not identity) so that a numpy.bool_ return value is accepted.
    assert change == True
    assert numpy.array_equal(expected_cluster_assignments, kmeans.cluster_assignments)
    # Ragged nested lists cannot be converted by numpy.array_equal on
    # NumPy >= 1.24 (it would return False), so compare plain lists instead.
    assert [list(points) for points in kmeans.data_point_assignments] == expected_data_point_assignments

    # Test no change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 0, 1]

    change = kmeans.assignment()

    assert change == False
    assert numpy.array_equal(expected_cluster_assignments, kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments] == expected_data_point_assignments
def test_closest_cluster():
    """Check KMeans.closest_cluster() picks the expected cluster per point.

    Also verifies the values left in ``kmeans.distances`` afterwards, and the
    returned index when neither centroid shares an observed coordinate with
    the data point.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    expected_clusters = [
        0,  # point 0: MSE = 1.0 vs 1.0
        0,  # point 1: MSE = 4.0 vs 16.0
        1,  # point 2: MSE = 44.5 vs 37.0
    ]
    # Query every point first, then compare the results.
    found_clusters = [
        kmeans.closest_cluster(X[index], index, M[index])
        for index in range(3)
    ]
    for expected, found in zip(expected_clusters, found_clusters):
        assert expected == found

    # Also test whether the distances are set correctly
    assert numpy.array_equal([1.0, 4.0, 37.0], kmeans.distances)

    # Test when all MSEs return None (impossible but still testing behaviour)
    kmeans.centroids = numpy.ones((2, 3))
    kmeans.mask_centroids = [[0, 0, 1], [0, 0, 0]]
    expected_closest_cluster = 1
    assert expected_closest_cluster == kmeans.closest_cluster(X[0], 0, M[0])
def test_closest_cluster():
    """Check KMeans.closest_cluster() picks the expected cluster per point.

    Also verifies the values left in ``kmeans.distances`` afterwards, and the
    returned index when neither centroid shares an observed coordinate with
    the data point.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    expected_clusters = [
        0,  # point 0: MSE = 1.0 vs 1.0
        0,  # point 1: MSE = 4.0 vs 16.0
        1,  # point 2: MSE = 44.5 vs 37.0
    ]
    # Query every point first, then compare the results.
    found_clusters = [
        kmeans.closest_cluster(X[index], index, M[index])
        for index in range(3)
    ]
    for expected, found in zip(expected_clusters, found_clusters):
        assert expected == found

    # Also test whether the distances are set correctly
    assert numpy.array_equal([1.0, 4.0, 37.0], kmeans.distances)

    # Test when all MSEs return None (impossible but still testing behaviour)
    kmeans.centroids = numpy.ones((2, 3))
    kmeans.mask_centroids = [[0, 0, 1], [0, 0, 0]]
    expected_closest_cluster = 1
    assert expected_closest_cluster == kmeans.closest_cluster(X[0], 0, M[0])
def test_find_point_furthest_away():
    """Check KMeans.find_point_furthest_away() returns the expected point index.

    closest_cluster() is called for every point beforehand; the return values
    are ignored here (presumably the calls populate the per-point distances
    that find_point_furthest_away() relies on - see test_closest_cluster).
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSEs per point (cluster 0 vs cluster 1):
    #   point 0: 1.0 vs 1.0
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        kmeans.closest_cluster(X[index], index, M[index])

    expected_furthest_away = 2
    assert expected_furthest_away == kmeans.find_point_furthest_away()
def test_find_point_furthest_away():
    """Check KMeans.find_point_furthest_away() returns the expected point index.

    closest_cluster() is called for every point beforehand; the return values
    are ignored here (presumably the calls populate the per-point distances
    that find_point_furthest_away() relies on - see test_closest_cluster).
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # MSEs per point (cluster 0 vs cluster 1):
    #   point 0: 1.0 vs 1.0
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        kmeans.closest_cluster(X[index], index, M[index])

    expected_furthest_away = 2
    assert expected_furthest_away == kmeans.find_point_furthest_away()
def test_update():
    """Check KMeans.update() recomputes centroids and handles empty clusters.

    Three scenarios:
    1. Normal case - every cluster has at least one point.
    2. An empty cluster with the 'random' option - the cluster is randomly
       re-initialised (the random module is seeded so the result is fixed).
    3. An empty cluster with resolve_empty='singleton' - the point furthest
       away is reassigned to the empty cluster.
    """
    # Normal case
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)
    # points 0,1 to cluster 0, point 2 to cluster 1
    # dtype=object: the per-cluster lists are ragged, and NumPy >= 1.24 raises
    # ValueError for ragged input unless an object array is requested.
    kmeans.data_point_assignments = numpy.array([[0, 1], [2]], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

    # Expected values match a per-coordinate mean over the observed entries of
    # each cluster; a coordinate with no observed entries gives 0 and mask 0.
    new_centroids = [[1.0, 3.5, 0], [7.0, 8.0, 9.0]]
    new_mask_centroids = [[1, 1, 0], [1, 1, 1]]
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly re-initialise that cluster
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K, 'random')
    # points 0,1,2 to cluster 0, none to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1, 2], []], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    # The second centroid is the seeded random one (presumably drawn per
    # coordinate between mins and maxs - the values are consistent with that).
    new_centroids = [[4.0, 5.0, 9.0], [6.066531109150288, 6.547726417641815, 9.0]]
    new_mask_centroids = [[1, 1, 1], [1, 1, 1]]
    random.seed(0)  # must directly precede update() so the draws are reproducible
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign point furthest away to the cluster
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to cluster 1
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 3
    kmeans = KMeans(X, M, K, resolve_empty='singleton')
    # points 0,1 to cluster 0, 2 to cluster 1, none to cluster 2
    kmeans.data_point_assignments = numpy.array([[0, 1], [2], []], dtype=object)
    kmeans.cluster_assignments = [0, 0, 1]
    kmeans.centroids = [[1.0, 2.0, 3.0], [15.0, 16.0, 17.0], [500.0, 500.0, 500.0]]
    kmeans.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of every point to the centroid of its currently assigned cluster.
    kmeans.distances = numpy.array([
        kmeans.compute_MSE(kmeans.X[0], kmeans.centroids[0], M[0], kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[1], kmeans.centroids[0], M[1], kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[2], kmeans.centroids[1], M[2], kmeans.mask_centroids[1]),
    ])
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
    new_data_point_assignments = [[0], [1], [2]]
    new_distances = [0, 0, 0]
    kmeans.update()
    assert new_data_point_assignments == list(kmeans.data_point_assignments)
    assert numpy.array_equal(new_distances, kmeans.distances)
    assert numpy.array_equal(new_centroids, kmeans.centroids)
def test_cluster():
    """Check KMeans.cluster() end to end from fixed initial centroids.

    Three scenarios: fully observed data, data with missing values, and a
    case where one cluster ends up with a 0 coordinate. After each run the
    centroids, cluster assignments, per-cluster point lists and clustering
    results are verified. The scenario comments number points and clusters
    from 1, whereas the expected values use 0-based indices.
    """
    ### No missing values case.
    # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
    # Then point 1 will switch to cluster 1.
    X = [[2, 5], [7, 5], [2, 3]]
    M = numpy.ones((3, 2))
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 2.0], [4.0, 5.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1]

    expected_centroids = [[2.0, 4.0], [7.0, 5.0]]
    expected_cluster_assignments = [0, 1, 0]
    expected_data_point_assignments = [[0, 2], [1]]
    expected_clustering_results = [[1, 0], [0, 1], [1, 0]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments, kmeans.cluster_assignments)
    # The per-cluster point lists are ragged here; numpy.array_equal cannot
    # convert ragged input on NumPy >= 1.24 (it returns False), so compare
    # plain lists instead.
    assert [list(points) for points in kmeans.data_point_assignments] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results, kmeans.clustering_results)

    ### Missing values case.
    # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
    # Then point 2 will switch to cluster 1.
    X = [[2, 5], [3, -1], [10, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [1, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 7.0], [3.0, 2.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    expected_centroids = [[2.5, 5.0], [10.0, 1.5]]
    expected_cluster_assignments = [0, 0, 1, 1]
    expected_data_point_assignments = [[0, 1], [2, 3]]
    expected_clustering_results = [[1, 0], [1, 0], [0, 1], [0, 1]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments, kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results, kmeans.clustering_results)

    ### Cluster with 0 coordinate.
    # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
    X = [[2, 5], [3, -1], [-1, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [0, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 7.0], [4.0, 4.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    expected_centroids = [[2.5, 5.0], [0, 1.5]]
    expected_cluster_assignments = [0, 0, 1, 1]
    expected_data_point_assignments = [[0, 1], [2, 3]]
    expected_clustering_results = [[1, 0], [1, 0], [0, 1], [0, 1]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments, kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results, kmeans.clustering_results)
def test_update():
    """Check KMeans.update() recomputes centroids and handles empty clusters.

    Three scenarios:
    1. Normal case - every cluster has at least one point.
    2. An empty cluster with the 'random' option - the cluster is randomly
       re-initialised (the random module is seeded so the result is fixed).
    3. An empty cluster with resolve_empty='singleton' - the point furthest
       away is reassigned to the empty cluster.
    """
    # Normal case
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)
    # points 0,1 to cluster 0, point 2 to cluster 1
    # dtype=object: the per-cluster lists are ragged, and NumPy >= 1.24 raises
    # ValueError for ragged input unless an object array is requested.
    kmeans.data_point_assignments = numpy.array([[0, 1], [2]], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

    # Expected values match a per-coordinate mean over the observed entries of
    # each cluster; a coordinate with no observed entries gives 0 and mask 0.
    new_centroids = [[1.0, 3.5, 0], [7.0, 8.0, 9.0]]
    new_mask_centroids = [[1, 1, 0], [1, 1, 1]]
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly re-initialise that cluster
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K, 'random')
    # points 0,1,2 to cluster 0, none to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1, 2], []], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    # The second centroid is the seeded random one (presumably drawn per
    # coordinate between mins and maxs - the values are consistent with that).
    new_centroids = [[4.0, 5.0, 9.0],
                     [6.066531109150288, 6.547726417641815, 9.0]]
    new_mask_centroids = [[1, 1, 1], [1, 1, 1]]
    random.seed(0)  # must directly precede update() so the draws are reproducible
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign point furthest away to the cluster
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to cluster 1
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 3
    kmeans = KMeans(X, M, K, resolve_empty='singleton')
    # points 0,1 to cluster 0, 2 to cluster 1, none to cluster 2
    kmeans.data_point_assignments = numpy.array([[0, 1], [2], []], dtype=object)
    kmeans.cluster_assignments = [0, 0, 1]
    kmeans.centroids = [[1.0, 2.0, 3.0], [15.0, 16.0, 17.0],
                        [500.0, 500.0, 500.0]]
    kmeans.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of every point to the centroid of its currently assigned cluster.
    kmeans.distances = numpy.array([
        kmeans.compute_MSE(kmeans.X[0], kmeans.centroids[0], M[0],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[1], kmeans.centroids[0], M[1],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[2], kmeans.centroids[1], M[2],
                           kmeans.mask_centroids[1]),
    ])
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
    new_data_point_assignments = [[0], [1], [2]]
    new_distances = [0, 0, 0]
    kmeans.update()
    assert new_data_point_assignments == list(kmeans.data_point_assignments)
    assert numpy.array_equal(new_distances, kmeans.distances)
    assert numpy.array_equal(new_centroids, kmeans.centroids)
def test_cluster():
    """Check KMeans.cluster() end to end from fixed initial centroids.

    Three scenarios: fully observed data, data with missing values, and a
    case where one cluster ends up with a 0 coordinate. After each run the
    centroids, cluster assignments, per-cluster point lists and clustering
    results are verified. The scenario comments number points and clusters
    from 1, whereas the expected values use 0-based indices.
    """
    ### No missing values case.
    # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
    # Then point 1 will switch to cluster 1.
    X = [[2, 5], [7, 5], [2, 3]]
    M = numpy.ones((3, 2))
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 2.0], [4.0, 5.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1]

    expected_centroids = [[2.0, 4.0], [7.0, 5.0]]
    expected_cluster_assignments = [0, 1, 0]
    expected_data_point_assignments = [[0, 2], [1]]
    expected_clustering_results = [[1, 0], [0, 1], [1, 0]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    # The per-cluster point lists are ragged here; numpy.array_equal cannot
    # convert ragged input on NumPy >= 1.24 (it returns False), so compare
    # plain lists instead.
    assert [list(points) for points in kmeans.data_point_assignments
            ] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results,
                             kmeans.clustering_results)

    ### Missing values case.
    # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
    # Then point 2 will switch to cluster 1.
    X = [[2, 5], [3, -1], [10, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [1, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 7.0], [3.0, 2.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    expected_centroids = [[2.5, 5.0], [10.0, 1.5]]
    expected_cluster_assignments = [0, 0, 1, 1]
    expected_data_point_assignments = [[0, 1], [2, 3]]
    expected_clustering_results = [[1, 0], [1, 0], [0, 1], [0, 1]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments
            ] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results,
                             kmeans.clustering_results)

    ### Cluster with 0 coordinate.
    # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
    X = [[2, 5], [3, -1], [-1, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [0, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)
    kmeans.centroids = [[2.0, 7.0], [4.0, 4.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    expected_centroids = [[2.5, 5.0], [0, 1.5]]
    expected_cluster_assignments = [0, 0, 1, 1]
    expected_data_point_assignments = [[0, 1], [2, 3]]
    expected_clustering_results = [[1, 0], [1, 0], [0, 1], [0, 1]]
    kmeans.cluster()
    assert numpy.array_equal(expected_centroids, kmeans.centroids)
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    assert [list(points) for points in kmeans.data_point_assignments
            ] == expected_data_point_assignments
    assert numpy.array_equal(expected_clustering_results,
                             kmeans.clustering_results)