def ScalableKMeansPlusPlus(data, k, l, iter=5):
    """
    Seed k centroids with the KMeans|| (scalable KMeans++) procedure

    Parameters:
      data     ndarray of observations, one row per point
      k        number of centroids to return
      l        number of points sampled in each oversampling round
      iter     number of oversampling rounds (default 5)

    Returns:
      k rows drawn without replacement from the candidate centroids,
      using the candidate weights as sampling probabilities
    """
    # Start from a single row of data picked uniformly at random
    first_row = np.random.choice(range(data.shape[0]), 1)
    candidates = data[first_row, :]

    for _ in range(iter):
        # Distances from the data to the current candidate set
        to_candidates = distance(data, candidates)
        # Sampling distribution built from those distances and their cost
        probs = distribution(to_candidates, cost(to_candidates))
        # Draw l more candidates and stack them under the existing ones
        candidates = np.r_[candidates, sample_new(data, probs, l)]

    # Reduce the oversampled candidate set to k rows with a weighted draw
    to_candidates = distance(data, candidates)
    weights = get_weight(to_candidates, candidates)
    chosen = np.random.choice(len(weights), k, replace=False, p=weights)
    return candidates[chosen, :]
def ScalableKMeansPlusPlus(data, k, l, r):
    """
    Seed k centroids with KMeans||, tracking the row index of every candidate

    Parameters:
      data     ndarray of observations, one row per point
      k        number of centroids to return
      l        number of points sampled in each oversampling round
      r        number of oversampling rounds

    Returns:
      the centroids produced by wkmeanspp from the weighted candidate set
    """
    # One starting row, picked uniformly at random; keep its index as well
    picked = np.random.choice(range(data.shape[0]), 1)
    candidates = data[picked, :]

    for _ in range(r):
        # Distances from the data to the current candidates
        gaps = distance(data, candidates, picked)
        # Sampling distribution built from the distances and their cost
        probs = distribution(gaps, cost_s(gaps, len(data)))
        # Sample l new row indices and record both the indices and the rows
        fresh = sample_new(probs, l)
        picked = np.append(picked, fresh)
        candidates = np.r_[candidates, data[fresh]]

    gaps = distance(data, candidates, picked)
    raw_weights, total = get_weight(gaps, candidates, picked)
    probs = raw_weights / total
    # The first final centroid is drawn proportionally to the normalised weights
    first = np.random.choice(len(probs), 1, p=probs)

    # Weighted KMeans++-style seeding picks the remaining centroids;
    # only the centroid rows are returned, the index array is discarded
    seeds, _ = wkmeanspp(candidates, picked, k, raw_weights, first)
    return seeds
def KMeansPlusPlus(data, k):
    """
    Pick initial centroids one at a time in the KMeans++ fashion

    Parameters:
      data     ndarray of observations, one row per point
      k        number of centroids wanted

    Returns:
      ndarray holding the selected centroids, one per row
    """
    # First centroid: one row of data chosen uniformly at random
    seeds = data[np.random.choice(data.shape[0], 1), :]

    # Keep adding one sampled centre until k rows have been collected
    while seeds.shape[0] < k:
        gaps = distance(data, seeds)
        probs = distribution(gaps, cost(gaps))
        seeds = np.r_[seeds, sample_new(data, probs, 1)]
    return seeds
def test_sum_to_one():
    """The sampling distribution must sum to one."""
    points = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    picks = np.random.choice(range(4), 4)
    gaps = distance(points, points[picks, ])
    probs = distribution(gaps, cost(gaps))
    assert_almost_equal(np.sum(probs), 1)
def test_non_negative():
    """No entry of the sampling distribution may be negative."""
    points = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    picks = np.random.choice(range(4), 4)
    gaps = distance(points, points[picks, ])
    probs = distribution(gaps, cost(gaps))
    assert (probs >= 0).all()
def test_length():
    """sample_new must return exactly as many centres as requested."""
    points = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    picks = np.random.choice(range(4), 4)
    gaps = distance(points, points[picks, ])
    probs = distribution(gaps, cost(gaps))
    requested = 5
    drawn = sample_new(points, probs, requested)
    assert len(drawn) == 5
def test_in_data():
    """Every centre returned by sample_new must be an actual row of the data."""
    data = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    centroids = data[np.random.choice(range(4), 4), ]
    dist = distance(data, centroids)
    p = distribution(dist, cost(dist))
    c_new = sample_new(data, p, 5)
    # `row in data` on an ndarray evaluates `(data == row).any()`, which is
    # already true when a single coordinate matches somewhere. Require that
    # some row of data matches the sampled centre in every coordinate.
    for row in c_new:
        assert (data == row).all(axis=1).any()
def wkmeanspp(data, cent_pos, k, w, one_ind):
    """
    Weighted KMeans++-style seeding over a set of weighted candidate points

    Parameters:
      data      ndarray of candidate points, one row per candidate
      cent_pos  array indexed by candidate position; the entries of the
                selected candidates are returned alongside the centroids
      k         number of centroids to select
      w         weights attached to the candidates
      one_ind   position (into data) of the first centroid, chosen by the caller
                NOTE(review): presumably a length-1 index array so that
                data[one_ind, :] stays 2-D - confirm against callers

    Returns:
      centroids    the selected rows of data
      actual_cent  the cent_pos entries belonging to the selected rows
    """
    # Seed the selection with the caller-supplied first centroid
    chosen = data[one_ind, :]
    chosen_idx = np.array([one_ind])
    actual_cent = np.array([cent_pos[one_ind]])

    while chosen.shape[0] < k:
        # Distances from the candidates to the centroids selected so far
        gaps = distance(data, chosen, chosen_idx)
        # Weighted cost, then the weighted sampling distribution
        probs = w_distribution(gaps, w_cost(gaps, w), w)
        # Draw one more candidate position and record it everywhere
        nxt = w_sample_new(probs, 1)
        chosen_idx = np.append(chosen_idx, nxt)
        actual_cent = np.append(actual_cent, cent_pos[nxt])
        chosen = np.r_[chosen, data[nxt, :]]

    return chosen, actual_cent
def KMeans(data, k, centroids, max_iter = 1000):
    """
    Apply the KMeans clustering algorithm with unit-length centroid updates

    Each non-empty cluster's centroid is replaced by the mean of its members
    divided by that mean's Euclidean norm. A cluster that is empty, or whose
    mean has zero norm, keeps its previous centroid.

    Parameters:
      data        ndarray of observations, one row per point
      k           number of clusters
      centroids   initial centroids, one row per cluster
      max_iter    maximum number of update rounds (default 1000)

    Returns:
      dict with
        "Iteration before Coverge"  number of completed update rounds
        "Centroids"                 the final centroids
        "Labels"                    cluster index of each data row, taken from
                                    the last assignment step that ran
    """
    iterations = 0
    cluster_label = None
    while iterations < max_iter:
        dist = distance(data, centroids)
        ## give cluster label to each point
        cluster_label = np.argmin(dist, axis=1)

        ## calculate new centroids
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            if not members.any():
                # Empty cluster: nothing to average, keep the old centroid
                newCentroids[j] = centroids[j]
                continue
            newCentroids[j] = np.mean(data[members, :], axis=0)
            norm = float(np.sqrt(np.sum(np.square(newCentroids[j]))))
            if norm == 0:
                # A zero mean cannot be scaled to unit length; dividing would
                # produce NaN and the equality-based convergence test could
                # never succeed, so fall back to the previous centroid
                newCentroids[j] = centroids[j]
            else:
                newCentroids[j] = np.divide(newCentroids[j], norm)

        ## Check if it has converged
        if np.array_equal(centroids, newCentroids):
            print("Converge! after:",iterations,"iterations")
            break
        centroids = newCentroids
        iterations += 1

    if cluster_label is None:
        # max_iter <= 0: no round ran, so label the points against the
        # centroids that were passed in instead of failing on an unbound name
        cluster_label = np.argmin(distance(data, centroids), axis=1)

    return({"Iteration before Coverge": iterations,
            "Centroids": centroids,
            "Labels": cluster_label})
def weightedKMeans(data, k, weight, centroids, max_iter=10000):
    """
    Apply the weighted KMeans clustering algorithm

    Points are assigned to their nearest centroid; each centroid is then
    moved to the weighted mean of its members. A cluster with no members,
    or whose members carry zero total weight, keeps its previous centroid.

    Parameters:
      data        ndarray of observations, one row per point
      k           number of clusters
      weight      one weight per row of data (1-D)
      centroids   initial centroids, one row per cluster
      max_iter    maximum number of update rounds (default 10000)

    Returns:
      the final centroids
    """
    weight = np.asarray(weight, dtype=float)
    iterations = 0
    while iterations < max_iter:
        # Scaling a point's distances by its own weight cannot change which
        # centroid is nearest, so the weights are applied in the update step
        # (where they actually influence the result) rather than here.
        cluster_label = np.argmin(distance(data, centroids), axis=1)

        ## calculate new centroids
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            member_weight = weight[members]
            if member_weight.sum() == 0:
                # Covers both an empty cluster and an all-zero-weight cluster;
                # np.average would raise ZeroDivisionError on the latter
                newCentroids[j] = centroids[j]
            else:
                newCentroids[j] = np.average(
                    data[members, :], axis=0, weights=member_weight
                )

        ## Check if it is converged
        if np.array_equal(centroids, newCentroids):
            print("Converge")
            break
        centroids = newCentroids
        iterations += 1
    return centroids
def test_non_negative():
    """cost() must not be negative; checked on ten random draws."""
    for _ in range(10):
        points = np.random.normal(size=(5, 4))
        # Two centres drawn (with replacement) from the first four rows
        picks = np.random.choice(range(4), 2)
        gaps = distance(points, points[picks, ])
        assert cost(gaps) >= 0
def test_known1():
    """distance() on a hand-computed 2x2 example."""
    rows = [[0, 0], [1, 1]]
    left = np.array(rows)
    right = np.array(rows)
    expected = np.array([[0, 2], [2, 0]])
    assert_almost_equal(distance(left, right), expected)
def test_sum_to_one():
    """The centroid weights from get_weight must sum to one."""
    points = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    centres = points[np.random.choice(range(4), 4), ]
    weights = get_weight(distance(points, centres), centres)
    assert_almost_equal(np.sum(weights), 1)
def test_coincidence_when_zero():
    """All-zero inputs must give all-zero distances."""
    zeros_a = np.zeros((3, 4))
    zeros_b = np.zeros((5, 4))
    gaps = distance(zeros_a, zeros_b)
    assert (gaps == 0).all()
def clusterCostseed(data, predict, cent_pos_wk):
    """
    Clustering cost right after seeding the initial k centers

    Parameters:
      data         ndarray of observations
      predict      the seeded centers
      cent_pos_wk  indices passed through to distance() with the centers

    Returns:
      cost_s of the data-to-center distances, divided by 100
    """
    seed_gaps = distance(data, predict, cent_pos_wk)
    raw_cost = cost_s(seed_gaps, len(data))
    return raw_cost / 100
def test_non_negativity():
    """distance() must never return a negative entry."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    gaps = distance(left, right)
    assert (gaps >= 0).all()
def test_known2():
    """distance() on a hand-computed 3x3 example."""
    left = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
    right = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    expected = np.array([
        [3, 12, 27],
        [0, 3, 12],
        [3, 0, 3],
    ])
    assert_almost_equal(distance(left, right), expected)
def test_coincidence_when_not_zero():
    """Random, distinct inputs must give at least one non-zero distance."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    gaps = distance(left, right)
    assert (gaps != 0).any()
def test_non_negative():
    """No centroid weight from get_weight may be negative."""
    points = np.random.normal(size=(20, 4))
    # Four centres drawn (with replacement) from the first four rows
    centres = points[np.random.choice(range(4), 4), ]
    weights = get_weight(distance(points, centres), centres)
    assert (weights >= 0).all()
def findClosestPoint(ctr, point_ind):
    """
    Find the cluster member closest to a cluster centroid

    The centroid (the cluster mean) is also called the "concept" of the
    cluster. The points are read from the module-level `data` array.

    Parameters:
      ctr        the cluster centroid
      point_ind  indices (into data) of the points belonging to the cluster

    Returns:
      np.argmin over the distances from the selected points to ctr
      NOTE(review): this is a position within the selected points, not an
      index into data - confirm this is what callers expect
    """
    members = list(data[point_ind])
    gaps = distance(members, ctr)
    return np.argmin(gaps)
def test_symmetry():
    """distance(u, v) must equal the transpose of distance(v, u)."""
    left = np.random.normal(size=(3, 4))
    right = np.random.normal(size=(5, 4))
    forward = distance(left, right)
    backward = distance(right, left)
    assert (forward == backward.T).all()