def fit(self, samples):
    """
    Apply the spectral clustering algorithm described in Ng et al. 2002.
    The affinity matrix is calculated with the local scaling parameter
    described in Zelnik-Manor et al. 2005.
    Clustering algorithm used: custom simple KMeans.

    :param samples: data samples to cluster
    :return: cluster labels for each sample
    """
    self.samples = samples
    self.samples_len = len(samples)

    # Compute affinity matrix (A)
    affinity = self._affinity_matrix()

    # Inverse square root of the diagonal degree matrix (D), whose entries
    # are the sums of A's rows => D^-1/2
    d = np.diag(np.power(np.sum(affinity, axis=0), -1 / 2))

    # Compute the normalized Laplacian matrix (L) as L = D^-1/2 . A . D^-1/2
    laplacian = d @ affinity @ d

    # The k largest eigenvectors of L stacked as columns of a matrix (X);
    # L is symmetric, so its spectrum is real and the imaginary parts
    # returned by eigs can be dropped
    _, eig_vecs = sp.sparse.linalg.eigs(laplacian, k=self.k)
    eig_vecs = eig_vecs.real

    # Renormalize each of X's rows to unit length: Y = X / (sum(X^2))^1/2,
    # which gives a new representation (Y) of the data samples
    normalized_eig_vecs = eig_vecs / np.linalg.norm(
        eig_vecs, axis=1, keepdims=True)

    # Fit a KMeans model to Y and return the cluster labels
    kmeans = KMeans(k=self.k)
    return kmeans.fit(normalized_eig_vecs)
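# For reference, a minimal sketch of what `_affinity_matrix` could look like,
# assuming the local scaling of Zelnik-Manor & Perona 2005 with the paper's
# K = 7 neighbor. This is an illustrative guess, not necessarily the
# implementation used above.
def _affinity_matrix(self):
    # Pairwise squared Euclidean distances between all samples
    diff = self.samples[:, None, :] - self.samples[None, :, :]
    sq_dists = np.sum(diff ** 2, axis=-1)
    dists = np.sqrt(sq_dists)
    # Local scale sigma_i: distance from sample i to its 7th nearest
    # neighbor (column 0 of each sorted row is the zero self-distance)
    k_neighbor = min(7, self.samples_len - 1)
    sigma = np.sort(dists, axis=1)[:, k_neighbor]
    # A_ij = exp(-d(i, j)^2 / (sigma_i * sigma_j)), with A_ii = 0
    affinity = np.exp(-sq_dists / (sigma[:, None] * sigma[None, :]))
    np.fill_diagonal(affinity, 0)
    return affinity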
import numpy
import pylab

from unsupervised.kmeans import KMeans

if __name__ == "__main__":
    numpy.random.seed(1)
    # Two Gaussian blobs: one centered at the origin, one shifted to (1, 1)
    X = numpy.vstack((numpy.random.randn(10000, 2) * 0.3,
                      numpy.random.randn(10000, 2) * 0.3 + numpy.ones(2)))
    estimator = KMeans(2, 200, 10)
    estimator.fit(X)
    print(estimator.C_)
    print(estimator.v)
    Y = estimator.predict(X)
    print(Y)
    # Plot the samples and the two learned cluster centers
    pylab.plot(X[:, 0], X[:, 1], "o")
    pylab.plot([estimator.C_[0, 0]], [estimator.C_[0, 1]], "o")
    pylab.plot([estimator.C_[1, 0]], [estimator.C_[1, 1]], "o")
    pylab.show()
import numpy
import pylab
from sklearn.feature_extraction.image import extract_patches_2d

from unsupervised.kmeans import KMeans


def normalize_data(data):
    # Clip to +/- pstd and scale to [-1, 1]; pstd is assumed here to be
    # 3 standard deviations (the fragment began mid-function)
    pstd = 3 * data.std()
    data = numpy.fmax(numpy.fmin(data, pstd), -pstd) / pstd
    # Rescale from [-1, 1] to [0.1, 0.9]
    data = (data + 1) * 0.4 + 0.1
    return data


# `images` is assumed to be loaded earlier as an (n_samples, rows, cols) array
images = normalize_data(images)

patch_width = 8
n_filters = 25
n_samples, n_rows, n_cols = images.shape
n_features = n_rows * n_cols
# Extract up to 1000 random patches per image and flatten them into vectors
patches = [extract_patches_2d(images[i], (patch_width, patch_width),
                              max_patches=1000, random_state=i)
           for i in range(n_samples)]
patches = numpy.array(patches).reshape(-1, patch_width * patch_width)
print("Dataset consists of %d samples" % n_samples)

estimator = KMeans(n_filters=n_filters, batch_size=1000, n_iterations=200)
estimator.fit(patches)
print(estimator.predict(patches))

# Display the learned filters (cluster centers) as image tiles
pylab.figure()
for i in range(estimator.C_.shape[0]):
    rows = max(int(numpy.sqrt(n_filters)), 2)
    cols = max(int(numpy.sqrt(n_filters)), 2)
    pylab.subplot(rows, cols, i + 1)
    pylab.imshow(estimator.C_[i].reshape(patch_width, patch_width),
                 cmap=pylab.cm.gray, interpolation="nearest")
    pylab.xticks(())
    pylab.yticks(())
pylab.show()
import numpy
import pylab

from unsupervised.kmeans import KMeans
from tools import load_mnist, scale_features, test_classifier

if __name__ == "__main__":
    numpy.random.seed(0)
    train_images, T = load_mnist("training", 60000)
    test_images, T2 = load_mnist("testing", 10000)
    print("Dataset loaded")

    # Cluster on a subset, then use the centers as features for the full set
    train_cluster = train_images[:10000]
    train_classifier = train_images
    label_classifier = T

    n_filters = 196
    estimator = KMeans(n_filters=n_filters, batch_size=1000, n_iterations=10)
    estimator.fit(train_cluster)

    X = estimator.predict(train_classifier)
    X2 = estimator.predict(test_images)
    # Standardize both sets with the training set's statistics
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0) + 1e-8
    X = scale_features(X, X_mean, X_std)
    X2 = scale_features(X2, X_mean, X_std)
    print("Transformed datasets")

    test_classifier(X, label_classifier, X2, T2)

    # Display a grid of the learned filters (the loop body was truncated in
    # the original; the 28x28 MNIST shape below is an assumption)
    pylab.figure()
    pylab.subplots_adjust(wspace=0.0, hspace=0.0)
    n_cells = numpy.min((int(numpy.sqrt(n_filters)), 10))
    for i in range(n_cells ** 2):
        pylab.subplot(n_cells, n_cells, i + 1)
        pylab.imshow(estimator.C_[i].reshape(28, 28), cmap=pylab.cm.gray,
                     interpolation="nearest")
        pylab.xticks(())
        pylab.yticks(())
    pylab.show()
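# For reference, `scale_features` from tools is assumed to standardize with
# precomputed statistics, given how it is called above; a minimal sketch of
# that assumption:
def scale_features(X, mean, std):
    # Zero-mean, unit-variance scaling using training-set statistics
    return (X - mean) / std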
import numpy as np
import matplotlib.pyplot as plt

from unsupervised.kmeans import KMeans

n_samples = 100
n_features = 2
X = np.random.rand(n_samples, n_features)

# Elbow method: fit KMeans for increasing k and record the inertia
inertias = []
for k in range(1, 10):  # could extend up to len(X)
    kmeans = KMeans(k=k)
    kmeans.fit(X)
    print(kmeans.inertia_)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10, 20))
plt.plot(range(1, len(inertias) + 1), inertias)
plt.show()
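# For reference: `inertia_` above is assumed to be the usual within-cluster
# sum of squared distances to the assigned centroid. A standalone sketch of
# that computation, given samples X (n, d), centroids C (k, d), and integer
# labels (n,):
def inertia(X, C, labels):
    # Squared Euclidean distance from each sample to its own centroid
    return np.sum((X - C[labels]) ** 2)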
import argparse

import numpy as np
from sklearn.datasets import make_blobs

from unsupervised.kmeans import KMeans

parser = argparse.ArgumentParser(description='KMeans clustering demo.')
# --max_iter is read below; the fragment began at --center, so this argument
# and its default are an assumption
parser.add_argument('--max_iter', type=int,
                    help='Maximum number of iterations.', default=100)
parser.add_argument('--center', type=int,
                    help='Number of data centers.', default=3)
parser.add_argument('--random_state', type=int,
                    help='Random state for data generation.', default=42)
parser.add_argument('--n_samples', type=int,
                    help='Number of data points.', default=5000)
args = parser.parse_args()

# Setting parameters
max_iterations = args.max_iter
n_centers = args.center
n_samples = args.n_samples
random_state = args.random_state

# Create the clusters
X, y = make_blobs(n_samples=n_samples, centers=n_centers, n_features=2,
                  random_state=random_state, cluster_std=1.5)

# Clustering
kmeans = KMeans(k=n_centers, iterations=max_iterations,
                random_state=random_state, track_history=True)
kmeans.fit(X)

# Extract the centroids recorded at each iteration
centroids = kmeans.history_centroids

# Create decision boundary data: a mesh covering the data's bounding box
h = .1
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
area_data = np.c_[xx.ravel(), yy.ravel()]

# Prepare predictions
predicted_labels = []
predicted_area = []
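# One plausible continuation (an assumption, not part of the original
# fragment): assign labels by nearest centroid for each recorded iteration,
# so the evolution of the decision regions can be animated.
for step_centroids in centroids:
    step_centroids = np.asarray(step_centroids)
    # Nearest-centroid assignment for the training samples
    d = np.linalg.norm(X[:, None, :] - step_centroids[None, :, :], axis=2)
    predicted_labels.append(np.argmin(d, axis=1))
    # Same assignment over the mesh, reshaped to the grid for plotting
    d_area = np.linalg.norm(
        area_data[:, None, :] - step_centroids[None, :, :], axis=2)
    predicted_area.append(np.argmin(d_area, axis=1).reshape(xx.shape))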