def ScalableKMeansPlusPlus(data, k, l, iter=5):
    """ Select k initial centroids with the KMeans|| oversampling scheme

    Parameters:
      data     ndarray of observations, indexed by row
      k        number of centroids to return
      l        number of points sampled in each oversampling round
      iter     number of oversampling rounds (default 5)

    Returns:   k rows picked from the oversampled candidate centroids

    """

    # Seed the candidate set with one uniformly drawn row of data.
    seed_row = np.random.choice(range(data.shape[0]), 1)
    centroids = data[seed_row, :]

    for _ in range(iter):
        # The distances to the current candidates and their total cost
        # define the distribution the next candidates are drawn from.
        dist = distance(data, centroids)
        p = distribution(dist, cost(dist))

        # Draw l more candidates and stack them under the existing ones.
        centroids = np.r_[centroids, sample_new(data, p, l)]

    # Cut the oversampled candidates down to k by sampling them without
    # replacement, with probabilities given by get_weight.
    weights = get_weight(distance(data, centroids), centroids)
    chosen = np.random.choice(len(weights), k, replace=False, p=weights)

    return centroids[chosen, :]
def ScalableKMeansPlusPlus(data, k, l, r):
    """ Seed k centroids: r oversampling rounds, then weighted KMeans++

    Parameters:
      data     ndarray of observations, indexed by row
      k        number of centroids to return
      l        passed to sample_new as the number of positions per round
      r        number of oversampling rounds

    Returns:   the first value returned by wkmeanspp for the oversampled
               candidate set

    """

    # Start from a single uniformly drawn row; its position is tracked too.
    cent_pos = np.random.choice(range(data.shape[0]), 1)
    centroids = data[cent_pos, :]

    for _ in range(r):
        # Distances and cost w.r.t. the current candidates give the
        # sampling distribution for this round.
        dist = distance(data, centroids, cent_pos)
        p = distribution(dist, cost_s(dist, len(data)))

        # sample_new returns positions into data; record both the
        # positions and the corresponding rows.
        pos = sample_new(p, l)
        cent_pos = np.append(cent_pos, pos)
        centroids = np.r_[centroids, data[pos]]

    # Weight every candidate, then normalise the weights so they can be
    # used as probabilities for drawing the first final centroid.
    w, s = get_weight(
        distance(data, centroids, cent_pos), centroids, cent_pos)
    weights = w / s
    centroid_one_ind = np.random.choice(len(weights), 1, p=weights)

    # Weighted KMeans++ on the candidates yields the final k centers; the
    # second return value (positions) is not needed here.
    centroids_ini_spkm_para, _ = wkmeanspp(centroids, cent_pos, k, w,
                                           centroid_one_ind)

    return centroids_ini_spkm_para
def KMeansPlusPlus(data, k):
    """ Build the initial centroids with KMeans++ seeding

    Parameters:
      data                        ndarray of observations, indexed by row
      k                           number of centroids to produce

    Returns:
      "Centroids"                 the initial centroids, one per row

    """

    # The first centroid is a uniformly drawn row of data.
    first = np.random.choice(data.shape[0], 1)
    centroids = data[first, :]

    # Add one centroid per pass until k of them have been collected.
    while k > centroids.shape[0]:
        dist = distance(data, centroids)

        # The cost of the current centroids normalises the distribution
        # from which the next center is sampled.
        p = distribution(dist, cost(dist))

        centroids = np.r_[centroids, sample_new(data, p, 1)]

    return centroids
# Example no. 4
# 0
def test_sum_to_one():
    """The sampling distribution must sum to one."""
    data = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centroids = data[picked, ]

    dist = distance(data, centroids)
    p = distribution(dist, cost(dist))

    assert_almost_equal(np.sum(p), 1)
# Example no. 5
# 0
def test_non_negative():
    """No entry of the sampling distribution may be negative."""
    data = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centroids = data[picked, ]

    dist = distance(data, centroids)
    p = distribution(dist, cost(dist))

    assert (p >= 0).all()
def test_length():
    """sample_new must return as many centers as were requested."""
    data = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centroids = data[picked, ]

    dist = distance(data, centroids)
    p = distribution(dist, cost(dist))

    requested = 5
    c_new = sample_new(data, p, requested)
    assert len(c_new) == 5
def test_in_data():
    """Every center returned by sample_new must be an exact row of data."""
    data = np.random.normal(size=(20, 4))
    centroids = data[np.random.choice(range(4), 4), ]
    dist = distance(data, centroids)
    c = cost(dist)
    p = distribution(dist, c)
    l = 5
    c_new = sample_new(data, p, l)

    # `row in data` on a NumPy array evaluates `(data == row).any()`, which
    # is already true when a single coordinate matches somewhere. Compare
    # complete rows instead so the test really checks membership.
    # NOTE(review): assumes each item of c_new is one row with the same
    # number of columns as data -- confirm against sample_new.
    check = [(data == row).all(axis=1).any() for row in c_new]
    assert all(check)
    
# Example no. 8
# 0
def wkmeanspp(data, cent_pos, k, w, one_ind):
    """ Pick k rows of data with weighted KMeans++ seeding

    Parameters:
      data                        ndarray of candidate points, one per row
      cent_pos                    array looked up with the position of each
                                  selected row (cent_pos[pos])
      k                           number of centroids to select
      w                           weights handed to w_cost / w_distribution
      one_ind                     position in data of the first centroid

    Returns:
      centroids                   the selected rows of data
      actual_cent                 the cent_pos entries of the selected rows

    """

    # Start from the caller-supplied first centroid and keep two parallel
    # records: its position in data and its entry in cent_pos.
    centroids = data[one_ind, :]
    picked_pos = np.array([one_ind])
    actual_cent = np.array([cent_pos[one_ind]])

    while k > centroids.shape[0]:
        dist = distance(data, centroids, picked_pos)

        # The weighted cost normalises the weighted sampling distribution.
        p = w_distribution(dist, w_cost(dist, w), w)

        # Draw one new position and extend all three records with it.
        pos = w_sample_new(p, 1)
        picked_pos = np.append(picked_pos, pos)
        actual_cent = np.append(actual_cent, cent_pos[pos])
        centroids = np.r_[centroids, data[pos, :]]

    return centroids, actual_cent
def KMeans(data, k, centroids, max_iter=1000):
    """ Run KMeans with every recomputed centroid rescaled to unit norm

    Parameters:
      data                        ndarray of observations, indexed by row
      k                           number of clusters
      centroids                   initial centroids
      max_iter                    upper bound on the number of updates

    Returns (dict):
      "Iteration before Coverge"  number of centroid updates performed
      "Centroids"                 the final centroids
      "Labels"                    cluster index assigned to each point
    """

    iterations = 0

    while iterations < max_iter:
        # Label every point with the column index of its smallest distance.
        cluster_label = np.argmin(distance(data, centroids), axis=1)

        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            if not members.any():
                # An empty cluster keeps its previous centroid.
                newCentroids[j] = centroids[j]
                continue

            # Mean of the members, then division by its Euclidean norm.
            newCentroids[j] = np.mean(data[members, :], axis=0)
            norm = float(np.sqrt(np.sum(np.square(newCentroids[j]))))
            newCentroids[j] = np.divide(newCentroids[j], norm)

        # Stop as soon as an update leaves the centroids unchanged.
        if np.array_equal(centroids, newCentroids):
            print("Converge! after:",iterations,"iterations")
            break

        centroids = newCentroids
        iterations += 1

    return {
        "Iteration before Coverge": iterations,
        "Centroids": centroids,
        "Labels": cluster_label,
    }
# Example no. 10
# 0
def weightedKMeans(data, k, weight, centroids, max_iter=10000):
    """ Apply the weighted KMeans clustering algorithm

    Each point contributes to its cluster centroid in proportion to its
    weight (weighted mean).

    Parameters:
      data                        ndarray of observations, indexed by row
      k                           number of clusters
      weight                      one weight per row of data; assumed to be
                                  non-negative
      centroids                   initial centroids
      max_iter                    upper bound on the number of updates

    Returns:
      the final centroids
    """

    weight = np.asarray(weight, dtype=float)
    iterations = 0

    while iterations < max_iter:
        # Multiplying a point's distances by its own positive weight cannot
        # change which centroid is nearest, so the assignment step uses the
        # plain distances; the weights act in the centroid update below.
        cluster_label = np.argmin(distance(data, centroids), axis=1)

        ## calculate new centroids as weighted means of their members
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            member_weight = weight[members]
            if member_weight.sum() > 0:
                newCentroids[j] = np.average(
                    data[members, :], axis=0, weights=member_weight)
            else:
                # Empty cluster, or all members have zero weight: a weighted
                # mean is undefined, so keep the previous centroid.
                newCentroids[j] = centroids[j]

        ## Check if it is converged
        if np.array_equal(centroids, newCentroids):
            print("Converge")
            break

        centroids = newCentroids
        iterations += 1

    return centroids
# Example no. 11
# 0
def test_non_negative():
    """The cost must be non-negative on ten random data sets."""
    for _ in range(10):
        data = np.random.normal(size=(5, 4))
        picked = np.random.choice(range(4), 2)
        dist = distance(data, data[picked, ])
        assert cost(dist) >= 0
def test_known1():
    """distance must reproduce a hand-computed 2x2 result."""
    u = np.array([[0, 0], [1, 1]])
    v = np.array([[0, 0], [1, 1]])
    expected = np.array([[0, 2],
                         [2, 0]])
    assert_almost_equal(distance(u, v), expected)
def test_sum_to_one():
    """The weights returned by get_weight must sum to one."""
    data = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centroids = data[picked, ]

    w = get_weight(distance(data, centroids), centroids)

    assert_almost_equal(np.sum(w), 1)
def test_coincidence_when_zero():
    """All-zero inputs must give all-zero distances."""
    zeros_a = np.zeros((3, 4))
    zeros_b = np.zeros((5, 4))
    result = distance(zeros_a, zeros_b)
    assert (result == 0).all()
def clusterCostseed(data, predict, cent_pos_wk):
    """Return the clustering cost right after seeding the initial k centers.

    The value is cost_s of the distances from data to predict, divided
    by 100.
    """
    seed_dist = distance(data, predict, cent_pos_wk)
    n_points = len(data)
    return cost_s(seed_dist, n_points) / 100
def test_non_negativity():
    """Distances between random points must never be negative."""
    u = np.random.normal(size=(3, 4))
    v = np.random.normal(size=(5, 4))
    result = distance(u, v)
    assert (result >= 0).all()
def test_known2():
    """distance must reproduce a hand-computed 3x3 result."""
    u = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    v = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    expected = np.array([[3, 12, 27],
                         [0, 3, 12],
                         [3, 0, 3]])
    assert_almost_equal(distance(u, v), expected)
def test_coincidence_when_not_zero():
    """Random inputs must give at least one non-zero distance."""
    u = np.random.normal(size=(3, 4))
    v = np.random.normal(size=(5, 4))
    result = distance(u, v)
    assert (result != 0).any()
def test_non_negative():
    """No weight returned by get_weight may be negative."""
    data = np.random.normal(size=(20, 4))
    picked = np.random.choice(range(4), 4)
    centroids = data[picked, ]

    w = get_weight(distance(data, centroids), centroids)

    assert (w >= 0).all()
def findClosestPoint(ctr, point_ind):
    """Return the argmin of the distances between a cluster's points and ctr.

    The closest point to the centroid mean is also called the "concept" of
    that cluster.

    Parameters:
      ctr          the cluster centroid
      point_ind    indices of the points belonging to that cluster

    Returns:
      np.argmin over the distances of the selected points to ctr

    NOTE(review): `data` is not a parameter; it is read from the enclosing
    module scope, which is not visible here. The result is presumably a
    position within the point_ind selection rather than within data --
    confirm against distance's return shape.
    """
    members = list(data[point_ind])
    return np.argmin(distance(members, ctr))
def test_symmetry():
    """distance(u, v) must equal the transpose of distance(v, u)."""
    u = np.random.normal(size=(3, 4))
    v = np.random.normal(size=(5, 4))
    forward = distance(u, v)
    backward = distance(v, u)
    assert (forward == backward.T).all()