forked from ijkilchenko/numpy_or_bust
/
kmeans++.py
76 lines (59 loc) · 2.16 KB
/
kmeans++.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
import pandas as pd
from collections import defaultdict
from bokeh.charts import Scatter, show
def dist(x, y):
    """Return the Euclidean distance between points x and y.

    Args:
        x: point given as any array-like of real numbers (list, tuple,
            numpy array).
        y: point of the same shape as x.

    Returns:
        The L2 norm of ``x - y`` as computed by ``np.linalg.norm``.
    """
    # Coerce to float arrays: two plain lists cannot be subtracted, and
    # unsigned-integer arrays would wrap around on a negative difference.
    return np.linalg.norm(np.asarray(x, dtype=float) - np.asarray(y, dtype=float))
def init_centroids(X, k):
    """Pick initial centroids from X using k-means++ seeding.

    The first centroid is drawn uniformly from X. Each further centroid is
    drawn from X with probability proportional to the squared distance
    between a point and its nearest already-chosen centroid.

    Args:
        X: non-empty sequence of equally sized points (array-likes of
            real numbers).
        k: number of centroids wanted. For ``k <= 1`` a single centroid
            is returned.

    Returns:
        A list of elements of X (the chosen centroids).

    Raises:
        ValueError: if X is empty.
    """
    n = len(X)
    if n == 0:
        raise ValueError("X must contain at least one point")

    # One row per point; reshape so scalar points are handled as well.
    points = np.asarray(X, dtype=float).reshape(n, -1)

    # randint's upper bound is exclusive, so pass n (not n - 1) to make
    # every point, including the last, eligible.
    first = np.random.randint(n)
    centroids = [X[first]]

    # Squared distance from each point to its nearest chosen centroid,
    # updated incrementally instead of being recomputed from scratch.
    closest_sq = np.sum((points - points[first]) ** 2, axis=1)
    for _ in range(1, k):
        total = closest_sq.sum()
        if total > 0:
            idx = np.random.choice(n, p=closest_sq / total)
        else:
            # Every point coincides with a chosen centroid, so the weights
            # cannot be normalised; fall back to a uniform draw.
            idx = np.random.randint(n)
        centroids.append(X[idx])
        closest_sq = np.minimum(
            closest_sq, np.sum((points - points[idx]) ** 2, axis=1)
        )
    return centroids
def assign_to_clusters(X, centroids):
    """Find the nearest centroid for every point in X.

    Args:
        X: sequence of points (array-likes of real numbers).
        centroids: non-empty sequence of points with the same number of
            coordinates as the points in X.

    Returns:
        A list with one ``(distance, index)`` tuple per point of X, where
        ``index`` is the position in ``centroids`` of the nearest centroid
        and ``distance`` is the Euclidean distance to it. On ties the
        centroid with the lowest index wins.

    Raises:
        ValueError: if centroids is empty.
    """
    # len() rather than truthiness: centroids may be a numpy array.
    if len(centroids) == 0:
        raise ValueError("centroids must not be empty")

    # One row per centroid; reshape so scalar points are handled as well.
    centers = np.asarray(centroids, dtype=float)
    centers = centers.reshape(len(centers), -1)

    D = []
    for x in X:
        gaps = np.linalg.norm(centers - np.asarray(x, dtype=float).ravel(), axis=1)
        # argmin returns the first minimum, i.e. the lowest index on ties.
        nearest = int(np.argmin(gaps))
        D.append((gaps[nearest], nearest))
    return D
def recalc_centroids(X, D):
    """Compute the mean point of every cluster that has members.

    Args:
        X: sequence of points (each an iterable of numbers).
        D: one entry per point of X; ``D[i][1]`` is the cluster label of
            ``X[i]`` (the format returned by ``assign_to_clusters``).

    Returns:
        A list of centroids, each a list of per-coordinate means, ordered
        by ascending cluster label. Labels with no members produce no
        centroid, so the result can be shorter than the number of labels.
    """
    members_by_cluster = defaultdict(list)
    for i, d in enumerate(D):
        members_by_cluster[d[1]].append(X[i])

    centroids = []
    # Sort by label so that, when no cluster is empty, position i of the
    # result is the centroid of label i (rather than first-seen order).
    for cluster in sorted(members_by_cluster):
        members = members_by_cluster[cluster]
        # zip(*members) yields one tuple per coordinate axis.
        centroids.append([sum(axis) / len(members) for axis in zip(*members)])
    return centroids
def kmeanspp(X, k, num_iter=100):
    """Cluster X with k-means seeded by init_centroids and plot the result.

    Alternates assigning points to their nearest centroid and recomputing
    the centroids for ``num_iter`` rounds, then shows a scatter plot of
    the points coloured by cluster.

    Args:
        X: sequence of points with exactly two coordinates each (the plot
            names the columns 'x' and 'y').
        k: number of centroids requested from the seeding step.
        num_iter: number of assign/recompute rounds to run.

    Returns:
        None. The plot is displayed through ``show``.
    """
    centroids = init_centroids(X, k)
    for _ in range(num_iter):
        D = assign_to_clusters(X, centroids)
        centroids = recalc_centroids(X, D)

    # Assign once more after the loop so the plotted labels match the final
    # centroids and D is defined even when num_iter is 0.
    D = assign_to_clusters(X, centroids)
    labels = [d[1] for d in D]

    df = pd.concat([pd.DataFrame(X), pd.Series(labels)], axis=1)
    df.columns = ['x', 'y', 'cluster']
    p = Scatter(df, x='x', y='y', color='cluster')
    show(p)
if __name__ == '__main__':
    # Demo: cluster 500 random 2-D points, each coordinate drawn with
    # np.random.uniform(0, 100), into 5 clusters.
    num_points = 500
    X = []
    for _ in range(num_points):
        first_coord = np.random.uniform(0, 100)
        second_coord = np.random.uniform(0, 100)
        X.append(np.array([first_coord, second_coord]))
    kmeanspp(X, k=5)