# Demonstration of k-means and its limitations.
#
# Relies on names defined earlier in the file: `KMeans`, `plt` and the
# sample matrix `X` (indexed below as X[:, 0] / X[:, 1], i.e. 2-D points).
#
# NOTE(review): when this runs as a plain script rather than as separate
# notebook cells, the three scatter calls below draw onto the same axes.
# Confirm whether a `plt.figure()` is wanted before each one.

# The number of clusters must be determined beforehand.
labels = KMeans(6, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

# k-means is limited to linear cluster boundaries. The fundamental model
# assumption of k-means (points will be closer to their own cluster center
# than to others) means that the algorithm will often be ineffective if the
# clusters have complicated geometries.
from sklearn.datasets import make_moons

X, y = make_moons(200, noise=.05, random_state=0)
labels = KMeans(2, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

# Kernelized k-means: build a higher-dimensional representation of the data
# before running the k-means algorithm.
from sklearn.cluster import SpectralClustering

# random_state pins the k-means label-assignment step so the result is
# reproducible, matching every other estimator in this script.
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           assign_labels='kmeans', random_state=0)
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')

# k-means on digits with no a priori knowledge of the labels.
from sklearn.datasets import load_digits

digits = load_digits()
digits.data.shape  # bare expression: only displays in an interactive session
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits.data)
kmeans.cluster_centers_.shape  # result is 10 clusters in 64 dimensions

# Typical digit: each 64-dimensional cluster center is reshaped to an 8x8
# array, one per cluster, alongside a 2x5 grid of axes.
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
# min_errors.append(min_error) # # plt.figure() # plt.plot(list(range(1, 11)), min_errors) # plt.xlabel('k') # plt.ylabel('Error') # plt.title('Minimum k-medians error as k increases from 1 to 10 over 50 initializations') # # fname = os.path.join("..", "figs", "k_medians_cluster_errors.png") # plt.savefig(fname) elif question == '3.4': X = load_dataset('clusterData2.pkl')['X'] model = DBSCAN(eps=1, min_samples=3) y = model.fit_predict(X) print("Labels (-1 is unassigned):", np.unique(model.labels_)) plot_2dclustering(X,y) fname = os.path.join("..", "figs", "clusterdata_dbscan.png") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == '4': img = imread(os.path.join("..", "data", "mandrill.jpg")) # part 1: implement quantize_image.py # part 2: use it on the doge for b in [1,2,4,6]: