def test_assignment():
    """Check that KMeans.assignment() reassigns points and reports changes.

    Two scenarios use the same centroids: in the first the stored cluster
    assignments differ from the closest clusters, so a change is reported;
    in the second they already match, so no change is reported.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # New closest clusters are [0,0,1] - see test_closest_cluster
    expected_cluster_assignments = [0, 0, 1]
    expected_data_point_assignments = [[0, 1], [2]]

    # Test change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 1, 1]

    change = kmeans.assignment()
    assert change == True
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    # The per-cluster point lists have different lengths. NumPy >= 1.24
    # cannot build an array from such ragged input, which makes
    # numpy.array_equal return False, so compare plain lists instead.
    assert expected_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]

    # Test no change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 0, 1]

    change = kmeans.assignment()
    assert change == False
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    assert expected_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]
def test_assignment():
    """Check that KMeans.assignment() reassigns points and reports changes.

    Two scenarios use the same centroids: in the first the stored cluster
    assignments differ from the closest clusters, so a change is reported;
    in the second they already match, so no change is reported.
    """
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)

    # New closest clusters are [0,0,1] - see test_closest_cluster
    expected_cluster_assignments = [0, 0, 1]
    expected_data_point_assignments = [[0, 1], [2]]

    # Test change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 1, 1]

    change = kmeans.assignment()
    assert change == True
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    # The per-cluster point lists have different lengths. NumPy >= 1.24
    # cannot build an array from such ragged input, which makes
    # numpy.array_equal return False, so compare plain lists instead.
    assert expected_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]

    # Test no change
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]
    kmeans.cluster_assignments = [0, 0, 1]

    change = kmeans.assignment()
    assert change == False
    assert numpy.array_equal(expected_cluster_assignments,
                             kmeans.cluster_assignments)
    assert expected_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]
def test_closest_cluster():
    """Check the cluster returned by KMeans.closest_cluster() per point.

    Also verifies the values left in kmeans.distances afterwards, and the
    cluster returned when the centroid masks share no entry with the point.
    """
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    mask = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    kmeans = KMeans(data, mask, 2)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    expected_closest = [
        0,  # MSE = 1.0 vs 1.0
        0,  # MSE = 4.0 vs 16.0
        1,  # MSE = 44.5 vs 37.0
    ]
    # Query all points (in index order) before checking any result.
    actual_closest = [
        kmeans.closest_cluster(data[index], index, mask[index])
        for index in range(len(expected_closest))
    ]
    for expected, actual in zip(expected_closest, actual_closest):
        assert expected == actual

    # Also test whether the distances are set correctly
    assert numpy.array_equal([1.0, 4.0, 37.0], kmeans.distances)

    # Test when all MSEs return None (impossible but still testing behaviour)
    kmeans.centroids = numpy.ones((2, 3))
    kmeans.mask_centroids = [[0, 0, 1], [0, 0, 0]]

    fallback_cluster = kmeans.closest_cluster(data[0], 0, mask[0])
    assert 1 == fallback_cluster
def test_closest_cluster():
    """Check the cluster returned by KMeans.closest_cluster() per point.

    Also verifies the values left in kmeans.distances afterwards, and the
    cluster returned when the centroid masks share no entry with the point.
    """
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    mask = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    kmeans = KMeans(data, mask, 2)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    expected_closest = [
        0,  # MSE = 1.0 vs 1.0
        0,  # MSE = 4.0 vs 16.0
        1,  # MSE = 44.5 vs 37.0
    ]
    # Query all points (in index order) before checking any result.
    actual_closest = [
        kmeans.closest_cluster(data[index], index, mask[index])
        for index in range(len(expected_closest))
    ]
    for expected, actual in zip(expected_closest, actual_closest):
        assert expected == actual

    # Also test whether the distances are set correctly
    assert numpy.array_equal([1.0, 4.0, 37.0], kmeans.distances)

    # Test when all MSEs return None (impossible but still testing behaviour)
    kmeans.centroids = numpy.ones((2, 3))
    kmeans.mask_centroids = [[0, 0, 1], [0, 0, 0]]

    fallback_cluster = kmeans.closest_cluster(data[0], 0, mask[0])
    assert 1 == fallback_cluster
def test_find_point_furthest_away():
    """Check that find_point_furthest_away() returns point 2.

    closest_cluster() is called for every point first; its results are
    ignored here, the calls are made only for their effect on the instance.
    """
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    mask = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    kmeans = KMeans(data, mask, 2)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # Per-point MSEs against the two centroids:
    #   point 0: 1.0 vs 1.0
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        kmeans.closest_cluster(data[index], index, mask[index])

    assert 2 == kmeans.find_point_furthest_away()
def test_find_point_furthest_away():
    """Check that find_point_furthest_away() returns point 2.

    closest_cluster() is called for every point first; its results are
    ignored here, the calls are made only for their effect on the instance.
    """
    data = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    mask = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    kmeans = KMeans(data, mask, 2)

    # Equal distance for point 0
    kmeans.centroids = [[1.0, 3.0, 1.0], [2.0, 1.0, 3.0]]
    kmeans.mask_centroids = [[0, 1, 1], [1, 1, 0]]

    # Per-point MSEs against the two centroids:
    #   point 0: 1.0 vs 1.0
    #   point 1: 4.0 vs 16.0
    #   point 2: 44.5 vs 37.0
    for index in range(3):
        kmeans.closest_cluster(data[index], index, mask[index])

    assert 2 == kmeans.find_point_furthest_away()
def test_update():
    """Check KMeans.update() in the normal, 'random' and 'singleton' cases.

    The per-cluster point lists have different lengths, so they are stored
    in explicit object arrays: NumPy >= 1.24 raises ValueError when asked to
    build an array from ragged nested lists without dtype=object.
    """
    # Normal case
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)
    # points 0,1 to cluster 0, point 2 to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1], [2]], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

    new_centroids = [[1.0, 3.5, 0], [7.0, 8.0, 9.0]]
    new_mask_centroids = [[1, 1, 0], [1, 1, 1]]
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly re-initialise that cluster
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K, 'random')
    # points 0,1,2 to cluster 0, none to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1, 2], []], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [
        [4.0, 5.0, 9.0],
        [6.066531109150288, 6.547726417641815, 9.0],
    ]
    new_mask_centroids = [[1, 1, 1], [1, 1, 1]]

    # Fix the seed so the re-initialised centroid is reproducible.
    random.seed(0)
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign point furthest away to the cluster
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to cluster 1
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 3
    kmeans = KMeans(X, M, K, resolve_empty='singleton')
    # points 0,1 to cluster 0, 2 to cluster 1, none to cluster 2
    kmeans.data_point_assignments = numpy.array([[0, 1], [2], []],
                                                dtype=object)
    kmeans.cluster_assignments = [0, 0, 1]
    kmeans.centroids = [
        [1.0, 2.0, 3.0],
        [15.0, 16.0, 17.0],
        [500.0, 500.0, 500.0],
    ]
    kmeans.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of each point to the centroid of its currently assigned cluster.
    kmeans.distances = numpy.array([
        kmeans.compute_MSE(kmeans.X[0], kmeans.centroids[0], M[0],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[1], kmeans.centroids[0], M[1],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[2], kmeans.centroids[1], M[2],
                           kmeans.mask_centroids[1]),
    ])
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
    new_data_point_assignments = [[0], [1], [2]]
    new_distances = [0, 0, 0]

    kmeans.update()

    # Compare as plain lists of lists, whatever container update() leaves.
    assert new_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]
    assert numpy.array_equal(new_distances, kmeans.distances)
    assert numpy.array_equal(new_centroids, kmeans.centroids)
def test_cluster():
    """Run KMeans.cluster() end to end on three small data sets.

    Each scenario fixes the starting centroids, runs cluster(), and checks
    the final centroids, cluster assignments, per-cluster point lists and
    clustering_results.
    """

    def check_outcome(kmeans, centroids, cluster_assignments,
                      data_point_assignments, clustering_results):
        """Assert the state of kmeans after cluster() has been run."""
        assert numpy.array_equal(centroids, kmeans.centroids)
        assert numpy.array_equal(cluster_assignments,
                                 kmeans.cluster_assignments)
        # Per-cluster point lists may differ in length. NumPy >= 1.24 cannot
        # build an array from such ragged input, which makes
        # numpy.array_equal return False, so compare plain lists instead.
        assert data_point_assignments == [
            list(points) for points in kmeans.data_point_assignments
        ]
        assert numpy.array_equal(clustering_results,
                                 kmeans.clustering_results)

    ### No missing values case.
    # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
    # Then point 1 will switch to cluster 1.
    X = [[2, 5], [7, 5], [2, 3]]
    M = numpy.ones((3, 2))
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 2.0], [4.0, 5.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.0, 4.0], [7.0, 5.0]],
        cluster_assignments=[0, 1, 0],
        data_point_assignments=[[0, 2], [1]],
        clustering_results=[[1, 0], [0, 1], [1, 0]],
    )

    ### Missing values case.
    # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
    # Then point 2 will switch to cluster 1.
    X = [[2, 5], [3, -1], [10, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [1, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 7.0], [3.0, 2.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.5, 5.0], [10.0, 1.5]],
        cluster_assignments=[0, 0, 1, 1],
        data_point_assignments=[[0, 1], [2, 3]],
        clustering_results=[[1, 0], [1, 0], [0, 1], [0, 1]],
    )

    ### Cluster with 0 coordinate.
    # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
    X = [[2, 5], [3, -1], [-1, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [0, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 7.0], [4.0, 4.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.5, 5.0], [0, 1.5]],
        cluster_assignments=[0, 0, 1, 1],
        data_point_assignments=[[0, 1], [2, 3]],
        clustering_results=[[1, 0], [1, 0], [0, 1], [0, 1]],
    )
def test_update():
    """Check KMeans.update() in the normal, 'random' and 'singleton' cases.

    The per-cluster point lists have different lengths, so they are stored
    in explicit object arrays: NumPy >= 1.24 raises ValueError when asked to
    build an array from ragged nested lists without dtype=object.
    """
    # Normal case
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K)
    # points 0,1 to cluster 0, point 2 to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1], [2]], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

    new_centroids = [[1.0, 3.5, 0], [7.0, 8.0, 9.0]]
    new_mask_centroids = [[1, 1, 0], [1, 1, 1]]
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when one cluster has no points assigned to it - we then randomly re-initialise that cluster
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 2
    kmeans = KMeans(X, M, K, 'random')
    # points 0,1,2 to cluster 0, none to cluster 1
    kmeans.data_point_assignments = numpy.array([[0, 1, 2], []], dtype=object)
    kmeans.centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mask_centroids = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [
        [4.0, 5.0, 9.0],
        [6.066531109150288, 6.547726417641815, 9.0],
    ]
    new_mask_centroids = [[1, 1, 1], [1, 1, 1]]

    # Fix the seed so the re-initialised centroid is reproducible.
    random.seed(0)
    kmeans.update()
    assert numpy.array_equal(new_centroids, kmeans.centroids)
    assert numpy.array_equal(new_mask_centroids, kmeans.mask_centroids)

    # Case when we use the 'singleton' option for empty clusters - reassign point furthest away to the cluster
    # Points 0 and 1 go to cluster 0, 2 to cluster 1 and none to cluster 2.
    # Point 2 is furthest away, so gets reassigned to cluster 2 - making
    # cluster 1 empty. Then point 1 is furthest away and gets reassigned to cluster 1
    X = numpy.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    M = numpy.array([[1, 1, 0], [0, 1, 0], [1, 1, 1]])
    K = 3
    kmeans = KMeans(X, M, K, resolve_empty='singleton')
    # points 0,1 to cluster 0, 2 to cluster 1, none to cluster 2
    kmeans.data_point_assignments = numpy.array([[0, 1], [2], []],
                                                dtype=object)
    kmeans.cluster_assignments = [0, 0, 1]
    kmeans.centroids = [
        [1.0, 2.0, 3.0],
        [15.0, 16.0, 17.0],
        [500.0, 500.0, 500.0],
    ]
    kmeans.mask_centroids = [[1, 1, 0], [1, 1, 1], [1, 1, 1]]
    # Distance of each point to the centroid of its currently assigned cluster.
    kmeans.distances = numpy.array([
        kmeans.compute_MSE(kmeans.X[0], kmeans.centroids[0], M[0],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[1], kmeans.centroids[0], M[1],
                           kmeans.mask_centroids[0]),
        kmeans.compute_MSE(kmeans.X[2], kmeans.centroids[1], M[2],
                           kmeans.mask_centroids[1]),
    ])
    kmeans.mins = [1.0, 2.0, 9.0]
    kmeans.maxs = [7.0, 8.0, 9.0]

    new_centroids = [[1.0, 2.0, 0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
    new_data_point_assignments = [[0], [1], [2]]
    new_distances = [0, 0, 0]

    kmeans.update()

    # Compare as plain lists of lists, whatever container update() leaves.
    assert new_data_point_assignments == [
        list(points) for points in kmeans.data_point_assignments
    ]
    assert numpy.array_equal(new_distances, kmeans.distances)
    assert numpy.array_equal(new_centroids, kmeans.centroids)
# Example #10
# 0
def test_cluster():
    """Run KMeans.cluster() end to end on three small data sets.

    Each scenario fixes the starting centroids, runs cluster(), and checks
    the final centroids, cluster assignments, per-cluster point lists and
    clustering_results.
    """

    def check_outcome(kmeans, centroids, cluster_assignments,
                      data_point_assignments, clustering_results):
        """Assert the state of kmeans after cluster() has been run."""
        assert numpy.array_equal(centroids, kmeans.centroids)
        assert numpy.array_equal(cluster_assignments,
                                 kmeans.cluster_assignments)
        # Per-cluster point lists may differ in length. NumPy >= 1.24 cannot
        # build an array from such ragged input, which makes
        # numpy.array_equal return False, so compare plain lists instead.
        assert data_point_assignments == [
            list(points) for points in kmeans.data_point_assignments
        ]
        assert numpy.array_equal(clustering_results,
                                 kmeans.clustering_results)

    ### No missing values case.
    # Points 1,2 will first go to cluster 2, and point 3 to cluster 1.
    # Then point 1 will switch to cluster 1.
    X = [[2, 5], [7, 5], [2, 3]]
    M = numpy.ones((3, 2))
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 2.0], [4.0, 5.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.0, 4.0], [7.0, 5.0]],
        cluster_assignments=[0, 1, 0],
        data_point_assignments=[[0, 2], [1]],
        clustering_results=[[1, 0], [0, 1], [1, 0]],
    )

    ### Missing values case.
    # Points 2,3,4 will first go to cluster 2, and point 1 to cluster 1.
    # Then point 2 will switch to cluster 1.
    X = [[2, 5], [3, -1], [10, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [1, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 7.0], [3.0, 2.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.5, 5.0], [10.0, 1.5]],
        cluster_assignments=[0, 0, 1, 1],
        data_point_assignments=[[0, 1], [2, 3]],
        clustering_results=[[1, 0], [1, 0], [0, 1], [0, 1]],
    )

    ### Cluster with 0 coordinate.
    # Cluster 1 gets points 1 and 2, cluster 2 gets 3 and 4.
    X = [[2, 5], [3, -1], [-1, 1], [-1, 2]]
    M = [[1, 1], [1, 0], [0, 1], [0, 1]]
    K = 2
    kmeans = KMeans(X, M, K)

    kmeans.centroids = [[2.0, 7.0], [4.0, 4.0]]
    kmeans.mask_centroids = numpy.ones((2, 2))
    kmeans.cluster_assignments = [-1, -1, -1, -1]

    kmeans.cluster()
    check_outcome(
        kmeans,
        centroids=[[2.5, 5.0], [0, 1.5]],
        cluster_assignments=[0, 0, 1, 1],
        data_point_assignments=[[0, 1], [2, 3]],
        clustering_results=[[1, 0], [1, 0], [0, 1], [0, 1]],
    )