def __call__(self):
    """
    Run one freshly initialized pass of the Lloyd algorithm.

    Initial centers come from ``self.gs.init_strategy``; every point is
    then labelled from its squared distances to those centers and the
    labelling is handed to ``kmeans.lloyd``.  When the resulting WCSS is
    lower than the best one recorded so far (or nothing has been recorded
    yet), ``self.best_wcss`` and ``self.best_labels`` are replaced.

    Always returns False.
    """
    seeds = self.gs.init_strategy(self.gs.points, self.gs.nclusters)
    initial_centers = np.array(list(seeds))

    # Derive the starting labelling from the point-to-center distances.
    initial_labels = kmeans.get_labels(
        kmeans.get_point_center_sqdists(self.gs.points, initial_centers)
    )
    wcss, labels = kmeans.lloyd(self.gs.points, initial_labels)

    is_improvement = (self.best_wcss is None) or (wcss < self.best_wcss)
    if is_improvement:
        self.best_wcss = wcss
        self.best_labels = labels

    # do not stop
    # NOTE(review): presumably the caller treats the return value as a
    # "stop" flag -- confirm against the code that invokes this object.
    return False
def onlinekmeans(data, k, max_iter=100, b_size=100, centroids_init=None):
    """
    Run online (batch-sampled) k-means and return the centroids.

    On every iteration a batch is drawn with ``generate_random_centers``,
    each drawn point is assigned to a centroid with ``get_labels``, and
    that centroid is then moved toward the point using a per-centroid
    learning rate of ``1 / n``, where ``n`` is the number of points
    assigned to that centroid so far.

    :param data: indexable collection of points; ``len(data)`` and
        ``data[j]`` are used.
    :param k: number of centroids.
    :param max_iter: number of batch iterations to run.
    :param b_size: batch size; when it is larger than ``len(data)`` a
        quarter of the data size (rounded down) is used instead.
    :param centroids_init: optional initial centroids.  When supplied,
        this object is updated in place and is also the return value.
    :return: the centroids after ``max_iter`` iterations.
    """
    if centroids_init is None:
        k_centroid = generate_random_centers(data, k)
    else:
        k_centroid = centroids_init

    if b_size > len(data):
        # Default to 1/4 of the data size.  Floor division keeps the batch
        # size an integer on Python 3 as well as Python 2.
        b_size = len(data) // 4

    d_map = {}  # data_index => centroid_index
    center_count = [0] * k  # number of points assigned to each centroid

    # range() instead of xrange() so the function runs on Python 2 and 3.
    for _ in range(max_iter):
        # NOTE(review): the same helper provides the initial centroids
        # above, yet here its result is used as indices into ``data`` --
        # confirm against the helper's definition.
        batch = generate_random_centers(data=data, rndSize=b_size)

        # Cache the centroid nearest to each sampled point before any
        # centroid is moved in this iteration.
        for j in batch:
            d_map[j] = get_labels(x=data[j], k_centroids=k_centroid)[0]

        for j in batch:
            c = d_map[j]  # index of the centroid cached for this point
            center_count[c] += 1
            # 1.0 forces true division; with integer division the rate
            # would be 1 for the first point and 0 for every later one,
            # freezing the centroid.
            eta = 1.0 / center_count[c]
            k_centroid[c] = (1 - eta) * k_centroid[c] + eta * data[j]

    return k_centroid
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from kmeans import kmeans, get_labels

# Demo: draw four 2-D Gaussian clusters of 100 points each, cluster them
# with kmeans(), and plot the points coloured by their assigned label.

mean_01 = np.array([0.0, 0.0])
cov_01 = np.array([[1, 0.3], [0.3, 1]])
dist_01 = np.random.multivariate_normal(mean_01, cov_01, 100)

mean_02 = np.array([6.0, 7.0])
cov_02 = np.array([[1.5, 0.3], [0.3, 1]])
dist_02 = np.random.multivariate_normal(mean_02, cov_02, 100)

mean_03 = np.array([7.0, -5.0])
# The last entry was written as "1, 3", which made the rows ragged; a 2x2
# covariance matrix needs the single value 1.3 there.
cov_03 = np.array([[1.2, 0.5], [0.5, 1.3]])
# Sample with this cluster's own covariance (cov_01 was used before).
dist_03 = np.random.multivariate_normal(mean_03, cov_03, 100)

mean_04 = np.array([2.0, -7.0])
cov_04 = np.array([[1.2, 0.5], [0.5, 1.3]])
dist_04 = np.random.multivariate_normal(mean_04, cov_04, 100)

# Stack the clusters into one array and shuffle the rows so the points
# are not ordered by the cluster they were drawn from.
data = np.vstack((dist_01, dist_02, dist_03, dist_04))
np.random.shuffle(data)

centroids = kmeans(data, 4)
labels = get_labels(data, centroids)

plt.scatter(data[:, 0], data[:, 1], c=labels)
plt.show()