def ScalableKMeansPlusPlus(data, k, l, iter=5):
    """Apply the KMeans|| seeding algorithm.

    Parameters:
      data     ndarray of data, one row per point
      k        number of clusters (number of centroids returned)
      l        number of points sampled in each iteration
      iter     number of oversampling iterations

    Returns:   the k centroids selected by KMeans||
    """
    # Seed the candidate set with one row drawn uniformly at random.
    centroids = data[np.random.choice(range(data.shape[0]), 1), :]

    for _ in range(iter):
        # Get the distance between data and centroids
        dist = distance(data, centroids)

        # Calculate the cost of data with respect to the centroids
        norm_const = cost(dist)

        # Calculate the distribution for sampling l new centers
        p = distribution(dist, norm_const)

        # Sample the l new centers and append them to the original ones
        centroids = np.r_[centroids, sample_new(data, p, l)]

    # Reduce the oversampled candidate set to k centroids: the values
    # returned by get_weight are used as selection probabilities for a
    # draw of k distinct candidates.
    dist = distance(data, centroids)
    weights = get_weight(dist, centroids)

    chosen = np.random.choice(len(weights), k, replace=False, p=weights)
    return centroids[chosen, :]
def ScalableKMeansPlusPlus(data, k, l, r):
    """Seed k centroids: KMeans|| oversampling followed by wkmeanspp.

    Parameters:
      data     ndarray of data, one row per point
      k        number of clusters
      l        number of points sampled in each round
      r        number of oversampling rounds

    Returns:   the first value returned by wkmeanspp (the k initial centroids)
    """
    # Start from one row drawn uniformly at random; cent_pos tracks the
    # row indices (into data) of every candidate collected so far.
    cent_pos = np.random.choice(range(data.shape[0]), 1)
    centroids = data[cent_pos, :]

    for _ in range(r):
        # Get the distance between data and centroids
        dist = distance(data, centroids, cent_pos)
        # Calculate the cost of data with respect to the centroids
        norm_const = cost_s(dist, len(data))
        # Calculate the distribution for sampling l new centers
        p = distribution(dist, norm_const)
        # Sample the l new centers and append them to the original ones
        pos = sample_new(p, l)

        cent_pos = np.append(cent_pos, pos)
        centroids = np.r_[centroids, data[pos]]

    dist = distance(data, centroids, cent_pos)
    w, s = get_weight(dist, centroids, cent_pos)

    # Normalise the raw weights by s so they can be used as probabilities.
    weights = w / s

    # Pick the first final centroid among the candidates according to weights.
    centroid_one_ind = np.random.choice(len(weights), 1, p=weights)
    # employing weighted Spherical K-Means ++ to obtain final k cluster centers
    centroids_ini_spkm_para, _ = wkmeanspp(
        centroids, cent_pos, k, w, centroid_one_ind)

    return centroids_ini_spkm_para
def KMeansPlusPlus(data, k):
    """Apply the KMeans++ seeding algorithm to get the initial centroids.

    Parameters:
      data     ndarray of data, one row per point
      k        number of clusters

    Returns:   ndarray holding the k initial centroids, one per row
    """
    # Initialize the first centroid with a row drawn uniformly at random
    centroids = data[np.random.choice(data.shape[0], 1), :]

    # Add one centroid per pass until k rows have been collected.
    while centroids.shape[0] < k:

        # Get the distance between data and centroids
        dist = distance(data, centroids)

        # Calculate the cost of data with respect to the centroids
        norm_const = cost(dist)

        # Calculate the distribution for sampling a new center
        p = distribution(dist, norm_const)

        # Sample the new center and append it to the original ones
        centroids = np.r_[centroids, sample_new(data, p, 1)]

    return centroids
Example #4
def test_sum_to_one():
    """The sampling distribution derived from the distances sums to one."""
    data = np.random.normal(size=(20, 4))
    centroids = data[np.random.choice(range(4), 4), ]
    dist = distance(data, centroids)
    c = cost(dist)
    p = distribution(dist, c)
    # Without this check the test could never fail.
    np.testing.assert_almost_equal(np.sum(p), 1)
Example #5
def test_non_negative():
    """No entry of the sampling distribution may be negative."""
    points = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centers = points[picked]
    sq_dist = distance(points, centers)
    total = cost(sq_dist)
    probs = distribution(sq_dist, total)
    assert np.all(probs >= 0)
def test_length():
    """sample_new returns exactly as many centers as were requested."""
    n_requested = 5
    points = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centers = points[picked]
    sq_dist = distance(points, centers)
    probs = distribution(sq_dist, cost(sq_dist))
    drawn = sample_new(points, probs, n_requested)
    assert len(drawn) == n_requested
def test_in_data():
    """Every center returned by sample_new is an actual row of the data."""
    data = np.random.normal(size=(20, 4))
    centroids = data[np.random.choice(range(4), 4), ]
    dist = distance(data, centroids)
    c = cost(dist)
    p = distribution(dist, c)
    l = 5
    c_new = sample_new(data, p, l)
    # `row in data` on an ndarray is true as soon as ANY single element
    # matches, so compare complete rows instead.
    check = [(data == row).all(axis=1).any() for row in c_new]
    assert all(check)
Example #8
def wkmeanspp(data, cent_pos, k, w, one_ind):
    """Apply weighted KMeans++ seeding to pick k centroids from `data`.

    Parameters:
      data        ndarray of candidate points, one row per candidate
      cent_pos    index array parallel to the rows of `data`; only the
                  entries belonging to the chosen rows are returned
      k           number of clusters
      w           weights assigned to the candidates
      one_ind     index of the first, already sampled, centroid
                  (assumed to be a length-1 index array so that
                  data[one_ind, :] stays 2-D -- TODO confirm with caller)

    Returns a tuple (centroids, actual_cent):
      centroids     the chosen rows of `data`
      actual_cent   the entries of `cent_pos` belonging to those rows
    """
    # Initialize the first centroid
    centroids = data[one_ind, :]
    cent_pos_wk = np.array([one_ind])            # positions within `data`
    actual_cent = np.array([cent_pos[one_ind]])  # matching cent_pos entries

    while centroids.shape[0] < k:
        # Get the distance between data and centroids
        dist = distance(data, centroids, cent_pos_wk)
        # Calculate the cost of data with respect to the centroids
        norm_const = w_cost(dist, w)

        # Calculate the distribution for sampling a new center
        p = w_distribution(dist, norm_const, w)

        # Sample the new center and append it to the original ones
        pos = w_sample_new(p, 1)
        cent_pos_wk = np.append(cent_pos_wk, pos)
        actual_cent = np.append(actual_cent, cent_pos[pos])
        centroids = np.r_[centroids, data[pos, :]]

    return centroids, actual_cent
def KMeans(data, k, centroids, max_iter=1000):
    """Apply the KMeans clustering algorithm with unit-norm centroids.

    Parameters:
      data        ndarray of data, one row per point
      k           number of clusters
      centroids   initial centroids, one row per cluster
      max_iter    upper bound on the number of update rounds

    Returns a dict with the keys:
      "Iteration before Coverge"  update rounds completed before stopping
      "Centroids"                 the final centroids found by KMeans
      "Labels"                    the cluster index of each data row
    """
    iterations = 0
    while iterations < max_iter:
        dist = distance(data, centroids)
        # Give a cluster label to each point: the nearest centroid.
        cluster_label = np.argmin(dist, axis=1)

        # Calculate new centroids.
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            if not members.any():
                # An empty cluster keeps its previous centroid; taking the
                # mean of zero rows would produce NaN.
                newCentroids[j] = centroids[j]
            else:
                newCentroids[j] = np.mean(data[members, :], axis=0)
                # Rescale the mean to unit Euclidean norm.
                norm = float(np.sqrt(np.sum(np.square(newCentroids[j]))))
                newCentroids[j] = np.divide(newCentroids[j], norm)

        # Stop as soon as an update leaves the centroids unchanged.
        if np.array_equal(centroids, newCentroids):
            print("Converge! after:", iterations, "iterations")
            break

        centroids = newCentroids
        iterations += 1

    return {"Iteration before Coverge": iterations,
            "Centroids": centroids,
            "Labels": cluster_label}
Example #10
def weightedKMeans(data, k, weight, centroids, max_iter=10000):
    """Apply the weighted KMeans clustering algorithm.

    Parameters:
      data        ndarray of data, one row per point
      k           number of clusters
      weight      per-point weights; indexed as weight[:, np.newaxis] so
                  each point's distances are scaled by its weight
                  (presumably a 1-D array with one entry per row of data)
      centroids   initial centroids, one row per cluster
      max_iter    upper bound on the number of update rounds

    Returns:      the final centroids
    """
    iterations = 0

    while iterations < max_iter:
        dist = distance(data, centroids) * weight[:, np.newaxis]

        # Give a cluster label to each point.
        cluster_label = np.argmin(dist, axis=1)

        # Calculate new centroids.
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            if not members.any():
                # An empty cluster keeps its previous centroid; taking the
                # mean of zero rows would produce NaN.
                newCentroids[j] = centroids[j]
            else:
                newCentroids[j] = np.mean(data[members, :], axis=0)

        # Stop as soon as an update leaves the centroids unchanged.
        if np.array_equal(centroids, newCentroids):
            break

        centroids = newCentroids
        iterations += 1

    return centroids
Example #11
def test_non_negative():
    """cost() is non-negative on ten independently drawn random inputs."""
    for _ in range(10):
        points = np.random.normal(size=(5, 4))
        picked = np.random.choice(range(4), 2)
        centers = points[picked]
        total = cost(distance(points, centers))
        assert total >= 0
def test_known1():
    """distance() reproduces a hand-computed 2x2 result."""
    first = np.array([[0, 0], [1, 1]])
    second = np.array([[0, 0], [1, 1]])
    expected = np.array([[0, 2],
                         [2, 0]])
    actual = distance(first, second)
    assert_almost_equal(actual, expected)
def test_sum_to_one():
    """The weights produced by get_weight add up to one."""
    points = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centers = points[picked]
    sq_dist = distance(points, centers)
    weights = get_weight(sq_dist, centers)
    total = np.sum(weights)
    assert_almost_equal(total, 1)
def test_coincidence_when_zero():
    """All-zero inputs give a distance of exactly zero everywhere."""
    left = np.zeros((3, 4))
    right = np.zeros((5, 4))
    result = distance(left, right)
    assert np.all(result == 0)
def clusterCostseed(data, predict, cent_pos_wk):
    """Clustering cost right after seeding the initial k centers.

    Computes distance(data, predict, cent_pos_wk), passes it to cost_s
    together with the number of data rows, and returns that value
    divided by 100.
    """
    seed_dist = distance(data, predict, cent_pos_wk)
    raw_cost = cost_s(seed_dist, len(data))
    return raw_cost / 100
def test_non_negativity():
    """distance() never returns a negative entry for random inputs."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    result = distance(left, right)
    assert np.all(result >= 0)
def test_known2():
    """distance() reproduces a hand-computed 3x3 result."""
    first = np.array([[0, 0, 0],
                      [1, 1, 1],
                      [2, 2, 2]])
    second = np.array([[1, 1, 1],
                       [2, 2, 2],
                       [3, 3, 3]])
    expected = np.array([[3, 12, 27],
                         [0, 3, 12],
                         [3, 0, 3]])
    actual = distance(first, second)
    assert_almost_equal(actual, expected)
def test_coincidence_when_not_zero():
    """Random, non-coinciding inputs yield at least one non-zero distance."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    result = distance(left, right)
    assert np.any(result != 0)
def test_non_negative():
    """No weight produced by get_weight is negative."""
    points = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centers = points[picked]
    sq_dist = distance(points, centers)
    weights = get_weight(sq_dist, centers)
    assert np.all(weights >= 0)
def findClosestPoint(ctr, point_ind):
    """Locate the cluster member nearest to a centroid.

    Given a cluster centroid `ctr` (its mean, also called the "concept"
    of that cluster) and the indices `point_ind` of the points belonging
    to that cluster, return the argmin of the distances between those
    points and the centroid.

    NOTE: `data` is neither a parameter nor a local; it is read from the
    enclosing scope. The returned value is the flat argmin over the
    distance result for the selected points.
    """
    members = list(data[point_ind])
    member_dist = distance(members, ctr)
    nearest = np.argmin(member_dist)
    return nearest
def test_symmetry():
    """Swapping the arguments of distance() transposes the result."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    forward = distance(left, right)
    backward = distance(right, left)
    assert np.all(forward == backward.T)