Example #1
0
 def __call__(self):
     """
     Run one randomly initialised Lloyd pass and remember the best result.

     Initial centers come from ``self.gs.init_strategy``; the points are
     labelled from their squared distances to those centers and handed to
     ``kmeans.lloyd``, which yields a ``(wcss, labels)`` pair.  The pair
     is stored in ``best_wcss`` / ``best_labels`` when no result has been
     recorded yet or when the new wcss is strictly smaller.

     Always returns False.
     """
     gs = self.gs
     points = gs.points

     # Seed centers -> squared distances -> starting assignment.
     seeds = np.array(list(gs.init_strategy(points, gs.nclusters)))
     start_labels = kmeans.get_labels(
         kmeans.get_point_center_sqdists(points, seeds))

     wcss, final_labels = kmeans.lloyd(points, start_labels)

     is_better = (self.best_wcss is None) or (wcss < self.best_wcss)
     if is_better:
         self.best_wcss, self.best_labels = wcss, final_labels

     # do not stop
     return False
def onlinekmeans(data, k, max_iter=100, b_size=100, centroids_init=None):
    """
    Mini-batch ("online") k-means; returns the centroids after ``max_iter``
    batches.

    For every batch the nearest centroid of each sampled point is cached
    first, then each of those centroids is moved towards its point with a
    per-centroid learning rate of ``1 / n``, where ``n`` is the number of
    points that centroid has absorbed so far.

    data           -- indexable collection of points; ``data[j]`` must
                      support scalar multiplication and addition.
    k              -- number of centroids.
    max_iter       -- number of batches to process.
    b_size         -- batch size; when it exceeds ``len(data)`` a quarter
                      of the data size is used instead.
    centroids_init -- optional starting centroids.  When supplied, this
                      object is updated in place and returned.
    """
    k_centroid = generate_random_centers(data, k) if centroids_init is None else centroids_init
    # Floor division keeps the fallback batch size an integer on every
    # Python version (plain `/` yields a float on Python 3).
    b_size = b_size if b_size <= len(data) else len(data) // 4
    d_map = {}  # data index => index of its nearest centroid
    center_count = [0] * k  # points absorbed by each centroid so far
    for _ in range(max_iter):
        # NOTE(review): the values returned here are used as indices into
        # `data`, while the call above uses the same helper to obtain the
        # initial centroids -- confirm the helper's return value against
        # its definition.
        batch = generate_random_centers(data=data, rndSize=b_size)
        for j in batch:
            # Cache the center nearest to x before any centroid moves.
            d_map[j] = get_labels(x=data[j], k_centroids=k_centroid)[0]
        for j in batch:
            # c is the index of the cached centroid for this point.
            c = d_map[j]
            center_count[c] += 1
            # Force true division: with two ints `1 / n` truncates to 0 on
            # Python 2 for every n > 1, which would freeze the centroid
            # after its first update.
            eta = 1.0 / center_count[c]
            k_centroid[c] = (1 - eta) * k_centroid[c] + eta * data[j]
    return k_centroid
Example #3
0
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from kmeans import kmeans, get_labels

# Demo: sample four 2-D Gaussian clusters (100 points each), pool and shuffle
# them, cluster the result with kmeans() and plot the points coloured by
# label.  Every cluster is drawn with its own 2x2 symmetric covariance.

mean_01 = np.array([0.0, 0.0])
cov_01 = np.array([[1, 0.3], [0.3, 1]])
dist_01 = np.random.multivariate_normal(mean_01, cov_01, 100)

mean_02 = np.array([6.0, 7.0])
cov_02 = np.array([[1.5, 0.3], [0.3, 1]])
dist_02 = np.random.multivariate_normal(mean_02, cov_02, 100)

# The second row must have two entries (1.3, not "1, 3"): a ragged literal
# is not a valid covariance matrix.
mean_03 = np.array([7.0, -5.0])
cov_03 = np.array([[1.2, 0.5], [0.5, 1.3]])
dist_03 = np.random.multivariate_normal(mean_03, cov_03, 100)

mean_04 = np.array([2.0, -7.0])
cov_04 = np.array([[1.2, 0.5], [0.5, 1.3]])
dist_04 = np.random.multivariate_normal(mean_04, cov_04, 100)

# Stack the samples into one array and shuffle the rows in place so the
# points are no longer grouped by their source distribution.
data = np.vstack((dist_01, dist_02, dist_03, dist_04))
np.random.shuffle(data)

# Ask for as many clusters as there are source distributions.
centroids = kmeans(data, 4)
labels = get_labels(data, centroids)

# Colour each point by the label assigned to it.
plt.scatter(data[:, 0], data[:, 1], c=labels)

plt.show()