# scikit_learn_3_clustering.py
### CLUSTERING
## K-Means
# cost function
# J = \sum_k \sum_{i \in C_k} ||x_i - \mu_k||^2
# i.e. the sum of squared Euclidean distances from each
# observation to the centroid of its assigned cluster.
# KM algorithm
# i) choose k; randomly select k initial centroids;
# ii) assign each observation to its nearest centroid by Euclidean distance;
# iii) recompute each centroid as the mean of the observations assigned to it;
# iv) repeat ii) and iii) until the centroids stop moving
#     (or a maximum number of iterations is reached).
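# A minimal NumPy sketch of steps i)-iv) above, for illustration only
# (the function name, defaults and convergence test are my own choices,
# not part of the original demo; the demo below uses sklearn's KMeans):
import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=0):
    rng = np.random.RandomState(seed)
    # i) pick k observations at random as initial centroids
    centroids = X[rng.choice(len(X), k, replace=False)]
    labels = np.zeros(len(X), dtype=int)
    for _ in range(n_iter):
        # ii) assign each observation to its nearest centroid
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # iii) move each centroid to the mean of its assigned observations
        #      (empty clusters are not handled in this sketch)
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # iv) stop once the centroids no longer move
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels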
# KM demo on toy data (randomly generated; TODO: fix the data generation)
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# import utilities
cluster1 = np.random.uniform(.5, 1.5, (2, 10))
cluster2 = np.random.uniform(3.5, 4.5, (2, 10))
# np.random.uniform(low, high, (n_dimensions, n_observations)):
# each cluster is a 2 x 10 array, i.e. 10 two-dimensional points.
X = np.hstack((cluster1, cluster2)).T
# hstack gives a 2 x 20 array; transposing yields 20 observations x 2 features.
# np.hstack: horizontal stacking
# x = np.array([1,2,3]); y = np.array([3,4,5])
# np.hstack((x,y))
# array([1, 2, 3, 3, 4, 5])
# np.vstack((x,y))
# array([[1, 2, 3],
# [3, 4, 5]])
K = range(1, 10)
meandistortions = []
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # mean distance from each observation to its nearest centroid
    meandistortions.append(
        sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))
        / X.shape[0])
# model building (for K = 1, ..., 9)
plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
plt.show()
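# Aside (my addition, not part of the original demo): sklearn's KMeans exposes
# the cost J directly as the inertia_ attribute (sum of squared distances of
# the samples to their closest centroid), so an elbow plot can also be built
# from that instead of recomputing distances with cdist:
inertias = []
for k in K:
    inertias.append(KMeans(n_clusters=k).fit(X).inertia_)
# plt.plot(K, inertias, 'bx-') shows the same characteristic elbow shape.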
## Evaluation
# silhouette coefficient
# s = (b - a) / max(a, b)
# a: mean distance between an instance and the other
#    instances in the same cluster.
# b: mean distance between the instance and the
#    instances in the next closest cluster.
# a measure of the compactness and separation of the
# clusters. it is calculated for each instance; for a
# data set, it is the mean of the individual samples' scores.
# value range: [-1, 1]; the higher the better.
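# A small self-contained check of the formula above (the toy points below are
# my own made-up numbers, not part of the original demo): compute a and b by
# hand for one point in two tiny clusters and compare with sklearn.
import numpy as np
from sklearn.metrics import silhouette_samples
pts = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
lbls = np.array([0, 0, 1, 1])
# for the first point: a = distance to the only other member of its cluster,
# b = mean distance to the members of the other (next closest) cluster.
a = np.linalg.norm(pts[0] - pts[1])
b = np.mean([np.linalg.norm(pts[0] - pts[2]), np.linalg.norm(pts[0] - pts[3])])
s_manual = (b - a) / max(a, b)
s_sklearn = silhouette_samples(pts, lbls)[0]
# s_manual and s_sklearn agree (both roughly 0.87 for these numbers).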
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt
plt.subplot(3,2,1)
# subplot(nrows,ncols,fignum)
# subplot(211) = subplot(2,1,1)
# fignum: position of current picture.
x1 = np.array([1,2,3,1,5,6,5,5,6,7,8,9,7,9])
x2 = np.array([1,3,2,2,8,6,7,6,7,1,2,1,1,3])
# len: 14
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
# zip pairs x1 and x2 element-wise: a 14 x 2 matrix of points
plt.xlim([0,10])
plt.ylim([0,10])
plt.title('Instances')
plt.scatter(x1,x2)
colors = ['b','g','r','c','m','y','k','b']
markers = ['o','s','D','v','^','p','*','+']
tests = [2,3,4,5,8]
subplot_counter = 1
for t in tests:
    subplot_counter += 1
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)
    # plot each instance with the color/marker of its assigned cluster
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('K = %s, silhouette coefficient = %.03f' %
              (t, metrics.silhouette_score(X, kmeans_model.labels_,
                                           metric='euclidean')))
plt.show()
## Image Compression
# image quantization
# a lossy compression method that replaces a range of
# similar colors (within-cluster members) in an image
# with a single color. it reduces the size of the image file.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
import mahotas as mh
# import utilities
original_img = np.array(mh.imread('image_quantization.png'),
                        dtype=np.float64) / 255
original_dimensions = tuple(original_img.shape)
width, height, depth = tuple(original_img.shape)
image_flattened = np.reshape(original_img,(width*height, depth))
# flatten image
image_array_sample = shuffle(image_flattened, random_state=0)[:1000]
estimator = KMeans(n_clusters=64, random_state=0)
estimator.fit(image_array_sample)
# create 64 clusters from a sample of 1000 randomly
# selected colors.
cluster_assignments = estimator.predict(image_flattened)
compressed_palette = estimator.cluster_centers_
compressed_img = np.zeros((width,height,compressed_palette.shape[1]))
label_idx = 0
for i in range(width):
    for j in range(height):
        compressed_img[i][j] = compressed_palette[cluster_assignments[label_idx]]
        label_idx += 1
# fill the compressed image pixel by pixel with the centroid color
# of each pixel's assigned cluster
plt.subplot(122)
plt.title('Original')
plt.imshow(original_img)
plt.axis('off')
plt.subplot(121)
plt.title('Compressed')
plt.imshow(compressed_img)
plt.axis('off')
plt.show()
## Semi-Supervised Learning: Cluster + Classification
# procedure:
# i) learn features by clustering unlabeled data.
# (Phase I and II)
# ii) use learned features to build a supervised classifier.
# (Phase III)
# Phase I: clustering
# representation of images
# i) each image is broken down into a set of descriptors,
#    which are then clustered.
# ii) the image is then represented with a vector with
#     one element for each cluster (each element encodes
#     the number of descriptors extracted from the image
#     that were assigned to that cluster).
# current experiment: 60% train / 40% test split.
# Phase I performs step i).
import numpy as np
import mahotas as mh
from mahotas.features import surf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.cluster import MiniBatchKMeans
# MiniBatchKMeans
# computes the distances to the centroids for only a random
# mini-batch of the instances in each iteration.
# the benefit is much faster fitting, usually at a small cost
# in cluster quality (see the short timing sketch after the
# imports below).
import glob
# import utilities
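# Aside (my own illustration, not part of the original pipeline): a rough
# timing comparison of KMeans vs. MiniBatchKMeans on synthetic blobs; the
# sample size, number of centers and n_init values are arbitrary assumptions.
from time import time
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
X_demo, _ = make_blobs(n_samples=20000, centers=50, random_state=0)
t0 = time()
KMeans(n_clusters=50, n_init=3, random_state=0).fit(X_demo)
t_full = time() - t0
t0 = time()
MiniBatchKMeans(n_clusters=50, n_init=3, random_state=0).fit(X_demo)
t_mini = time() - t0
print('full KMeans: %.2fs, MiniBatchKMeans: %.2fs' % (t_full, t_mini))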
all_instance_filenames = []
all_instance_targets = []
for f in glob.glob('cat_dog_images/*.jpg'):
    target = 1 if 'cat' in f else 0
    all_instance_filenames.append(f)
    all_instance_targets.append(target)
# label the images by filename: cat -> 1, dog -> 0
surf_features = []
counter = 0
for f in all_instance_filenames:
    print('Reading image:', f)
    image = mh.imread(f, as_grey=True)
    surf_features.append(surf.surf(image)[:, 5:])
    # drop the first 5 columns of each SURF row (keypoint location,
    # scale, etc.) and keep only the descriptor values
train_len = int(len(all_instance_filenames)*.6)
X_train_surf_features = np.concatenate(surf_features[:train_len])
X_test_surf_features = np.concatenate(surf_features[train_len:])
y_train = all_instance_targets[:train_len]
y_test = all_instance_targets[train_len:]
# training-testing split
n_clusters = 300
print('Clustering', len(X_train_surf_features), 'features')
estimator = MiniBatchKMeans(n_clusters=n_clusters)
estimator.fit_transform(X_train_surf_features)
# model building
# Phase II: feature learning
# at Phase II we do ii).
# we end up with a list of images, each represented as a
# bincounted vector of cluster assignments ("bag of features"),
# for instance:
#   feature 1  feature 2  ...  feature n
#   [   25        340     ...    210   ]
# a short np.bincount illustration follows.
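# Illustration of the bincount + zero-padding trick used below (the cluster
# assignments here are made-up numbers, assuming n_clusters were only 5):
demo_clusters = np.array([0, 0, 2, 2, 2, 1])
demo_counts = np.bincount(demo_clusters)    # -> [2, 1, 3]
# bincount only counts up to the largest label seen, so pad with zeros
# until the vector has one entry per cluster:
demo_counts = np.append(demo_counts, np.zeros(5 - len(demo_counts)))
# -> [2., 1., 3., 0., 0.]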
X_train = []
for instance in surf_features[:train_len]:
    # each instance is the SURF-descriptor representation of one image.
    clusters = estimator.predict(instance)
    # 'clusters' assigns each descriptor of the image to a cluster.
    features = np.bincount(clusters)
    # bincount gives one count per cluster id seen, so its length can be
    # shorter than n_clusters (300 here); each bin holds the number of
    # descriptors of this image that were assigned to that cluster.
    if len(features) < n_clusters:
        features = np.append(features,
                             np.zeros((1, n_clusters - len(features))))
    X_train.append(features)
# X_train ends up as a list of bincounted feature vectors, one per image.
X_test = []
for instance in surf_features[train_len:]:
    clusters = estimator.predict(instance)
    features = np.bincount(clusters)
    if len(features) < n_clusters:
        features = np.append(features,
                             np.zeros((1, n_clusters - len(features))))
    X_test.append(features)
# same operation for X_test
# Phase III: classification
clf = LogisticRegression(C=.001, penalty='l2')
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print('Precision: ', precision_score(y_test, predictions))
print('Recall: ', recall_score(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))
# performance-wise, this outperforms classifiers that use only
# raw pixel intensities as features, while using far fewer
# dimensions (300 here, versus 10,000 for even a small
# 100 by 100 pixel image).