def test_kmeans_fit():
    dataset = datasets.load_iris()
    scaler = StandardScaler()
    x = scaler.fit_transform(dataset.data)

    clf = KMeans(3)
    clf.fit(x)

    assert len(clf.centers) == 3
    assert clf.n_clusters == 3
    assert np.array(clf.centers).shape == (3, 4)
    # chained != checks that all three centers are pairwise distinct
    assert (set(clf.centers[0]) != set(clf.centers[1])
            != set(clf.centers[2]) != set(clf.centers[0]))
def main():
    inputs = [[-14, -5], [13, 13], [20, 23], [-19, -11], [-9, -16],
              [21, 27], [-49, 15], [26, 13], [-46, 5], [-34, -1],
              [11, 15], [-49, 0], [-22, -16], [19, 28], [-12, -8],
              [-13, -19], [-41, 8], [-11, -6], [-25, -9], [-18, -3]]

    random.seed(0)
    clusterer = KMeans(3)
    clusterer.train(inputs)
    print("3-means:")
    print(clusterer.means)
    print()

    plot_squared_clustering_errors(inputs)
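# NOTE: plot_squared_clustering_errors is called above but not defined in this
# file. Below is a minimal sketch in the same from-scratch style; it assumes
# the KMeans class exposes train(), means, and a classify(point) method that
# returns the index of the closest mean. The helper names squared_distance and
# squared_clustering_errors are ours, not necessarily the original's.
import matplotlib.pyplot as plt


def squared_distance(v, w):
    # sum of squared coordinate differences
    return sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w))


def squared_clustering_errors(inputs, k):
    # total squared error from running k-means with k clusters on the inputs
    clusterer = KMeans(k)
    clusterer.train(inputs)
    return sum(squared_distance(point, clusterer.means[clusterer.classify(point)])
               for point in inputs)


def plot_squared_clustering_errors(inputs):
    # "elbow" plot: total squared error as a function of the number of clusters
    ks = range(1, len(inputs) + 1)
    errors = [squared_clustering_errors(inputs, k) for k in ks]
    plt.plot(ks, errors)
    plt.xticks(ks)
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.show()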
def run_kmeans(data_path):
    img = imread(data_path)
    img2 = img.reshape(img.shape[0] * img.shape[1], img.shape[2])

    # cluster for K = 2, 3, ..., 16 and save selected results
    # so the differences can be compared visually
    for cluster_count in range(2, 17):
        km = KMeans(cluster_count)
        V, cmap = km.fit(img2)
        if cluster_count in [2, 4, 6, 8, 16]:
            save_image(
                V,
                cmap,
                img.shape,
                "result/kmeans_{0}_clusters.png".format(cluster_count),
            )
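# NOTE: save_image is not defined in this file. A minimal sketch follows,
# under the assumption that km.fit returns V (a per-pixel cluster index) and
# cmap (one color row per cluster); the exact semantics in the original code
# may differ.
import numpy as np
import matplotlib.pyplot as plt


def save_image(V, cmap, shape, path):
    # look up each pixel's centroid color, restore (H, W, C), and write to disk
    quantized = np.asarray(cmap)[np.asarray(V, dtype=int)]
    plt.imsave(path, quantized.reshape(shape).astype(np.uint8))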
def test_kmeans_predict():
    np.random.seed(42)
    dataset = datasets.load_iris()
    scaler = StandardScaler()
    x = scaler.fit_transform(dataset.data)
    y = dataset.target

    clf = KMeans(3)
    clf.fit(x)
    centers = clf.centers

    y2_pred = clf.predict(x[20:30])
    assert len(x[20:30]) == len(y2_pred)

    y_pred = clf.predict(x)
    assert set(y_pred) == set(y)
    assert clf.n_clusters == 3
    # predicting must not move the fitted centers
    assert np.array_equiv(clf.centers, centers)

    # every sampled point must be at least as close to its own center
    # as to either of the other two centers
    for cluster in range(clf.n_clusters):
        for point_n in np.random.choice(clf.clusters[cluster],
                                        size=min(20, len(clf.clusters[cluster])),
                                        replace=False):
            point = x[point_n]
            dist_internal = np.linalg.norm(clf.centers[cluster] - point)
            dist_external = min(
                np.linalg.norm(clf.centers[(cluster + 1) % 3] - point),
                np.linalg.norm(clf.centers[(cluster + 2) % 3] - point))
            assert dist_internal <= dist_external
def main():
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
                     header=None)
    y = df.iloc[:, 4].values
    y[y == "Iris-setosa"] = 0
    y[y == "Iris-versicolor"] = 1
    y[y == "Iris-virginica"] = 2

    # use sepal length and petal length, min-max scaled to [0, 1]
    x = df.iloc[:, [0, 2]].values
    x = np.array(x)
    y = np.array(y)
    x[:, 0] = (x[:, 0] - x[:, 0].min()) / (x[:, 0].max() - x[:, 0].min())
    x[:, 1] = (x[:, 1] - x[:, 1].min()) / (x[:, 1].max() - x[:, 1].min())

    clf = KMeans.KMeans(n_clusters=3)
    clf.fit(x)

    # decision regions on a mesh over the feature space
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_range = np.arange(x1_min, x1_max, 0.02)
    x2_range = np.arange(x2_min, x2_max, 0.02)
    xx, yy = np.meshgrid(x1_range, x2_range)
    x_ = np.array([xx.ravel(), yy.ravel()]).T
    z = clf.predict(x_)
    z = z.reshape(xx.shape)
    plt.contourf(xx, yy, z)

    colors = ['g', 'm', 'y']
    names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    for idx in range(len(np.unique(y))):
        plt.scatter(x[:, 0][y == idx], x[:, 1][y == idx],
                    c=colors[idx], s=10, edgecolors='k', label=names[idx])
    for k in clf.centroids:
        plt.scatter(clf.centroids[k][0], clf.centroids[k][1],
                    marker='x', s=100, c='b')
        print(clf.centroids[k])

    plt.title('K-Means')
    plt.xlabel('sepal length [normalized]')
    plt.ylabel('petal length [normalized]')
    plt.legend(loc='upper left')
    plt.show()
def test_kmeans_init():
    clf = KMeans(3)
    assert clf.n_clusters == 3
    assert len(clf.clusters) == 3
    assert clf.centers == []
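# The three tests above pin down the interface of the custom KMeans class:
# a constructor taking the number of clusters, centers == [] before fitting,
# a clusters list holding per-cluster point indices, and fit/predict methods.
# A minimal sketch of a class with that shape (plain Lloyd's algorithm; every
# detail beyond what the tests assert, e.g. the init scheme and empty-cluster
# handling, is an assumption):
import numpy as np


class KMeans:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.centers = []                                 # filled in by fit()
        self.clusters = [[] for _ in range(n_clusters)]   # point indices per cluster

    def fit(self, x, max_iter=300):
        rng = np.random.default_rng()
        # start from k distinct input points chosen at random
        self.centers = x[rng.choice(len(x), self.n_clusters, replace=False)]
        for _ in range(max_iter):
            labels = self.predict(x)
            # recompute each center as the mean of its assigned points
            # (empty clusters are not handled in this sketch)
            new_centers = np.array([x[labels == k].mean(axis=0)
                                    for k in range(self.n_clusters)])
            if np.allclose(new_centers, self.centers):
                break
            self.centers = new_centers
        self.clusters = [np.where(labels == k)[0].tolist()
                         for k in range(self.n_clusters)]

    def predict(self, x):
        # index of the nearest center for each row of x
        dists = np.linalg.norm(
            x[:, None, :] - np.asarray(self.centers)[None, :, :], axis=2)
        return dists.argmin(axis=1)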
def kmeans_comparison_credita():
    X, y = datasets.load_credita()
    results = []

    # PCA --------------------------
    # transform the dataset with our PCA into 2 components
    # for better visualization in 2 dimensions
    pca = PCA(2, verbose=True)
    X_pca_2d = pca.fit_transform(X)

    # the last 2 components are significantly lower than the rest;
    # they do not carry enough information and can be truncated safely
    k = X.shape[1] - 2
    pca2 = PCA(k, verbose=True)
    X_pca = pca2.fit_transform(X)

    # run KMeans on the original dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    # run KMeans on the reduced dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X_pca)
    results.append(('KMeans-PCA', y, y_pred))

    # t-SNE ------------------------
    # transform the dataset with t-SNE:
    # run several times and keep the best result
    best_tsne = TSNE(2)
    tsne = best_tsne
    for _ in range(10):
        res = tsne.fit_transform(X)
        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_tsne = res
        tsne = TSNE(2)

    # run KMeans on the reduced dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X_tsne)
    results.append(('KMeans-TSNE', y, y_pred))

    print('\n\n')
    print_binary_metrics(X, results)

    # plot
    fig, ax = plt.subplots(2, 3, figsize=(15, 10))
    plt.subplots_adjust(bottom=.10, left=.05, top=.90, right=.95)

    # PCA --------------------------
    ax[0, 0].scatter(X_pca_2d[y == 0, 0], X_pca_2d[y == 0, 1])
    ax[0, 0].scatter(X_pca_2d[y == 1, 0], X_pca_2d[y == 1, 1])
    ax[0, 0].title.set_text('Credit-A PCA')

    y_pred = results[0][2]
    ax[0, 1].scatter(X_pca_2d[y_pred == 0, 0], X_pca_2d[y_pred == 0, 1])
    ax[0, 1].scatter(X_pca_2d[y_pred == 1, 0], X_pca_2d[y_pred == 1, 1])
    ax[0, 1].title.set_text('KMeans on original dataset')

    y_pred = 1 - results[1][2]  # invert labels for visualization purposes
    ax[0, 2].scatter(X_pca_2d[y_pred == 0, 0], X_pca_2d[y_pred == 0, 1])
    ax[0, 2].scatter(X_pca_2d[y_pred == 1, 0], X_pca_2d[y_pred == 1, 1])
    ax[0, 2].title.set_text('KMeans on dataset PCA')

    # t-SNE -------------------------
    ax[1, 0].scatter(X_tsne[y == 0, 0], X_tsne[y == 0, 1])
    ax[1, 0].scatter(X_tsne[y == 1, 0], X_tsne[y == 1, 1])
    ax[1, 0].title.set_text('Credit-A t-SNE')

    y_pred = results[0][2]
    ax[1, 1].scatter(X_tsne[y_pred == 0, 0], X_tsne[y_pred == 0, 1])
    ax[1, 1].scatter(X_tsne[y_pred == 1, 0], X_tsne[y_pred == 1, 1])
    ax[1, 1].title.set_text('KMeans on original dataset')

    y_pred = results[2][2]
    ax[1, 2].scatter(X_tsne[y_pred == 0, 0], X_tsne[y_pred == 0, 1])
    ax[1, 2].scatter(X_tsne[y_pred == 1, 0], X_tsne[y_pred == 1, 1])
    ax[1, 2].title.set_text('KMeans on dataset t-SNE')

    plt.show()
def kmeans_comparison_satimage():
    results = []
    X, y = datasets.load_satimage()
    y = y.values.reshape(-1)

    # PCA --------------------------
    pca = PCA(2, verbose=True)
    X_trans = pca.fit_transform(X)

    # run KMeans on the original dataset
    kmeans = KMeans(k=6, n_init=20)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    # run KMeans on the reduced dataset
    kmeans = KMeans(k=6, n_init=20)
    y_pred_PCA = kmeans.fit_predict(X_trans)
    results.append(('KMeans-PCA', y, y_pred_PCA))

    # t-SNE ------------------------
    # transform the dataset with t-SNE:
    # run several times and keep the best result
    best_tsne = TSNE(2)
    tsne = best_tsne
    for _ in range(3):
        res = tsne.fit_transform(X)
        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_trans2 = res
        tsne = TSNE(2)

    # run KMeans on the reduced dataset
    kmeans = KMeans(k=6, n_init=20)
    y_pred_tSNE = kmeans.fit_predict(X_trans2)
    results.append(('KMeans-TSNE', y, y_pred_tSNE))

    print(results)
    print('\n\n')
    print_multi_metrics(X, results)

    # figures: KMeans on the original dataset
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means cluster histogram for SatImage')
    ax.hist(y_pred, zorder=3)
    ax.set_xlabel('Clusters', fontsize=13)
    ax.set_ylabel('Number of instances', fontsize=13)
    ax.grid(zorder=0)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on original SatImage dataset')
    for i in np.unique(y_pred):
        ax.scatter(X_trans[y_pred == i, 0], X_trans[y_pred == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()

    # figures: KMeans on the PCA-reduced dataset
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means cluster histogram for SatImage')
    ax.hist(y_pred_PCA, zorder=3)
    ax.set_xlabel('Clusters', fontsize=13)
    ax.set_ylabel('Number of instances', fontsize=13)
    ax.grid(zorder=0)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on PCA SatImage')
    for i in np.unique(y_pred_PCA):
        ax.scatter(X_trans[y_pred_PCA == i, 0], X_trans[y_pred_PCA == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()

    # figures: KMeans in the t-SNE embedding
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means t-SNE for SatImage')
    for i in np.unique(y_pred_tSNE):
        ax.scatter(X_trans2[y_pred_tSNE == i, 0], X_trans2[y_pred_tSNE == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on original SatImage dataset')
    for i in np.unique(y_pred):
        ax.scatter(X_trans2[y_pred == i, 0], X_trans2[y_pred == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()

    # figures: ground truth
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('SatImage PCA Ground Truth')
    for i in np.unique(y):
        ax.scatter(X_trans[y == i, 0], X_trans[y == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('SatImage t-SNE Ground Truth')
    for i in np.unique(y):
        ax.scatter(X_trans2[y == i, 0], X_trans2[y == i, 1],
                   alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()
def kmeans_comparison_kropt():
    X, y = datasets.load_kropt()
    results = []
    # from sklearn.metrics.cluster import davies_bouldin_score
    # print(davies_bouldin_score(X, y))

    # apply the custom PCA for dimensionality reduction
    pca = PCA(n_components=2, verbose=True)
    X_trans = pca.fit_transform(X)

    # apply K-Means to the original data
    kmeans = KMeans(k=18, n_init=50)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    # apply K-Means to the transformed data
    kmeans = KMeans(k=18, n_init=50)
    y_pred_PCA = kmeans.fit_predict(X_trans)
    results.append(('KMeans-PCA', y, y_pred_PCA))

    # t-SNE ------------------------
    # transform the dataset with t-SNE; a single run only, since t-SNE
    # takes too long on a dataset as large as Kropt
    best_tsne = TSNE(2)
    tsne = best_tsne
    for _ in range(1):
        res = tsne.fit_transform(X)
        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_trans2 = res
        tsne = TSNE(2)

    # run KMeans on the reduced dataset
    kmeans = KMeans(k=18, n_init=50)
    y_pred_tsne = kmeans.fit_predict(X_trans2)
    results.append(('KMeans-TSNE', y, y_pred_tsne))

    print('\n\n')
    print_multi_metrics(X, results)

    # results visualization
    total_colors = mcolors.cnames
    selected_colors = random.sample(list(total_colors), k=18)
    fig, ax = plt.subplots(2, 3, figsize=(30, 10))

    # PCA result visualization
    cvec = [selected_colors[label] for label in y]
    ax[0, 0].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 0].title.set_text('Kropt Ground Truth - Visual PCA')
    ax[0, 0].set_xlabel('PC1')
    ax[0, 0].set_ylabel('PC2')

    cvec = [selected_colors[label] for label in y_pred]
    ax[0, 1].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 1].title.set_text('K-Means Clustering - Visual PCA')
    ax[0, 1].set_xlabel('PC1')
    ax[0, 1].set_ylabel('PC2')

    cvec = [selected_colors[label] for label in y_pred_PCA]
    ax[0, 2].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 2].title.set_text('K-Means PCA Kropt')
    ax[0, 2].set_xlabel('PC1')
    ax[0, 2].set_ylabel('PC2')

    # t-SNE result visualization
    cvec = [selected_colors[label] for label in y]
    ax[1, 0].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 0].title.set_text('Kropt Ground Truth - Visual t-SNE')
    ax[1, 0].set_xlabel('t-SNE 1')
    ax[1, 0].set_ylabel('t-SNE 2')

    cvec = [selected_colors[label] for label in y_pred]
    ax[1, 1].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 1].title.set_text('K-Means Clustering - Visual t-SNE')
    ax[1, 1].set_xlabel('t-SNE 1')
    ax[1, 1].set_ylabel('t-SNE 2')

    cvec = [selected_colors[label] for label in y_pred_tsne]
    ax[1, 2].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 2].title.set_text('K-Means t-SNE Kropt')
    ax[1, 2].set_xlabel('t-SNE 1')
    ax[1, 2].set_ylabel('t-SNE 2')

    plt.show()
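# The restart-and-keep-best t-SNE pattern above is repeated verbatim in all
# three comparison functions; a small helper that factors it out (a
# refactoring sketch, not part of the original code):
def best_tsne_embedding(X, n_runs, n_components=2):
    # run t-SNE n_runs times and keep the embedding with the lowest KL divergence
    best_kl, best_embedding = float('inf'), None
    for _ in range(n_runs):
        tsne = TSNE(n_components)
        embedding = tsne.fit_transform(X)
        if tsne.kl_divergence_ < best_kl:
            best_kl, best_embedding = tsne.kl_divergence_, embedding
    return best_embedding
# usage, e.g.: X_trans2 = best_tsne_embedding(X, n_runs=3)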
def train_model(self,
                data_train,
                labels_train=None,
                data_test=None,
                labels_test=None,
                verbose=1,
                compiled=False,
                clustering_loss='kld',
                decoder_loss='mse',
                clustering_loss_weight=0.5,
                hardening_order=1,
                hardening_strength=2.0,
                optimizer='adam',
                lr=0.001,
                decay=0.0):
    """Train the DCE model.

    If labels_train is not given, train the DCE model in an unsupervised
    learning process; otherwise, train it in a supervised learning process.

    Args:
        data_train: input training data
        labels_train: true labels of the training data
        data_test: input test data
        labels_test: true labels of the test data
        verbose: 0 turns off the screen prints
        compiled: boolean, indicating whether the model is already compiled
        clustering_loss: string, clustering-layer loss function
        decoder_loss: string, decoder loss function
        clustering_loss_weight: float in [0, 1], w_c
        hardening_order: odd int, the order of the hardening function
        hardening_strength: float >= 1.0, the strength of the hardening
        optimizer: string, Keras optimizer
        lr: learning rate
        decay: learning-rate decay

    Returns:
        train_loss: training loss
        test_loss: only if data_test and labels_test are not None in the
            supervised learning process
    """
    if not compiled:
        assert 0 <= clustering_loss_weight <= 1
        if optimizer == 'adam':
            dce_optimizer = optimizers.Adam(lr=lr, decay=decay)
        elif optimizer == 'sgd':
            dce_optimizer = optimizers.SGD(lr=lr, decay=decay)
        else:
            raise Exception('Input optimizer was not found')
        self.model.compile(
            loss={'clustering': clustering_loss,
                  'decoder_output': decoder_loss},
            loss_weights=[clustering_loss_weight, 1 - clustering_loss_weight],
            optimizer=dce_optimizer)

    if labels_train is not None:
        supervised_learning = True
        if verbose >= 1:
            print('Starting supervised learning')
    else:
        supervised_learning = False
        if verbose >= 1:
            print('Starting unsupervised learning')

    # initialize the model using sklearn's KMeans as the initial guess
    kmeans_init = KMeans(n_clusters=self.n_clusters)
    kmeans_init.build_model()
    encoder = Model(inputs=self.model.input,
                    outputs=self.model.get_layer(name='embedding_layer').output)
    kmeans_init.model.fit(encoder.predict(data_train))
    y_pred_last = kmeans_init.model.labels_
    self.model.get_layer(name='clustering').set_weights(
        [kmeans_init.model.cluster_centers_])

    # prepare training: p-distribution methods
    if not supervised_learning:
        # unsupervised learning
        assert hardening_order in DCE.HARDENING_FUNCS.keys()
        assert hardening_strength >= 1.0
        h_func = DCE.HARDENING_FUNCS[hardening_order]
    else:
        # supervised learning: build one-hot targets from the labels
        assert len(labels_train) == len(data_train)
        assert len(np.unique(labels_train)) == self.n_clusters
        p = np.zeros(shape=(len(labels_train), self.n_clusters))
        for i in range(len(labels_train)):
            p[i][labels_train[i]] = 1.0

        if data_test is not None:
            assert len(labels_test) == len(data_test)
            assert len(np.unique(labels_test)) == self.n_clusters
            p_test = np.zeros(shape=(len(labels_test), self.n_clusters))
            for i in range(len(labels_test)):
                p_test[i][labels_test[i]] = 1.0
            validation_loss = []

    # training start
    loss = []
    for iteration in range(int(self.max_iteration)):
        if iteration % self.update_interval == 0:
            # update p for the unsupervised learning process
            q, _ = self.model.predict(data_train)
            if not supervised_learning:
                p = DCE.hardening(q, h_func, hardening_strength)

            # fraction of points whose assigned label changed
            y_pred = q.argmax(1)
            delta_label_i = np.sum(y_pred != y_pred_last).astype(np.float32) \
                / y_pred.shape[0]
            y_pred_last = y_pred

            # check convergence
            if iteration > 0 and delta_label_i < self.clustering_tol:
                print(str(delta_label_i) + ' < ' + str(self.clustering_tol))
                print('Reached tolerance threshold. Stopping training.')
                break

        loss.append(self.model.train_on_batch(x=data_train, y=[p, data_train]))
        if supervised_learning and data_test is not None:
            validation_loss.append(
                self.model.test_on_batch(x=data_test, y=[p_test, data_test]))

        if verbose > 0 and iteration % self.update_interval == 0:
            print('Epoch: ' + str(iteration))
            if verbose == 1:
                print('  Total_loss = ' + str(loss[iteration][0]) +
                      '; Delta_label = ' + str(delta_label_i))
                print('  Clustering_loss = ' + str(loss[iteration][1]) +
                      '; Decoder_loss = ' + str(loss[iteration][2]))

    if iteration == self.max_iteration - 1:
        print('Reached maximum iteration. Stopping training.')

    if data_test is None:
        return np.array(loss).T
    else:
        return [np.array(loss).T, np.array(validation_loss).T]
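# DCE.hardening and DCE.HARDENING_FUNCS are referenced above but not shown.
# A minimal sketch of one plausible implementation in the DEC style: apply
# the (odd-order) hardening polynomial, raise it to the strength, normalize
# by cluster frequency, then row-normalize back to a distribution. How the
# original code maps hardening_order/hardening_strength onto this is an
# assumption.
import numpy as np


def hardening(q, h_func, strength):
    weight = h_func(q) ** strength / q.sum(axis=0)  # frequency-normalized sharpening
    return (weight.T / weight.sum(axis=1)).T        # rows sum to 1 again


# e.g. with the identity as the order-1 hardening function and strength 2.0,
# this reduces to DEC's standard target distribution p_ij = (q_ij^2 / f_j) / norm
q = np.array([[0.7, 0.2, 0.1],
              [0.3, 0.4, 0.3]])
print(hardening(q, h_func=lambda x: x, strength=2.0).round(3))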
def plot(cluster):
    # mesh over the cluster's bounding box, colored by predicted cluster
    x_axis = numpy.arange(cluster.borders[0][0] - 1,
                          cluster.borders[0][1] + 1, 0.1)  # boundaries of x
    y_axis = numpy.arange(cluster.borders[1][0] - 1,
                          cluster.borders[1][1] + 1, 0.1)  # boundaries of y
    x_axis, y_axis = numpy.meshgrid(x_axis, y_axis)
    z = numpy.array([
        cluster.predict([x, y])
        for x, y in numpy.c_[x_axis.ravel(), y_axis.ravel()]
    ])
    plt.pcolormesh(x_axis, y_axis, z.reshape(x_axis.shape), cmap=plt.cm.Paired)
    plt.show()


if __name__ == '__main__':
    data = load("Cluster.csv")
    pca = PCA(n_components=2)
    data = pca.fit_transform(data)

    results = {}
    for n in [1, 5, 10, 20]:
        kmeans = KMeans(n_clusters=n).fit(data)
        print("k:", n, "\terror:", kmeans.cluster_error_)
        results[kmeans] = kmeans.cluster_error_

    # plot the model with the lowest clustering error
    plot(min(results, key=lambda x: results[x]))
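# NOTE: load() is assumed to read the CSV into a plain array; a minimal sketch
# (it would sit above the __main__ block in the actual file, and the header
# handling of the original is unknown):
import pandas as pd


def load(path):
    # read the CSV and return the values as a numpy array
    return pd.read_csv(path).values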