def bench_k_means(estimator, name, data): t0 = time() estimator.fit(data) print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (name, (time() - t0), estimator.inertia_, metrics.homogeneity_score(labels, estimator.labels_), metrics.completeness_score(labels, estimator.labels_), metrics.v_measure_score(labels, estimator.labels_), metrics.adjusted_rand_score(labels, estimator.labels_), metrics.adjusted_mutual_info_score(labels, estimator.labels_), metrics.silhouette_score(data, estimator.labels_, metric='euclidean', sample_size=sample_size)))
af = AffinityPropagation(preference=-50).fit(X) cluster_centers_indices = af.cluster_centers_indices_ labels = af.labels_ n_clusters_ = len(cluster_centers_indices) print('Estimated number of clusters: %d' % n_clusters_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean')) # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle plt.close('all') plt.figure(1) plt.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): class_members = labels == k cluster_center = X[cluster_centers_indices[k]] plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) n_noise_ = list(labels).count(-1) print('Estimated number of clusters: %d' % n_clusters_) print('Estimated number of noise points: %d' % n_noise_) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)) print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels)) print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) # ############################################################################# # Plot result import matplotlib.pyplot as plt # Black removed and is used for noise instead. unique_labels = set(labels) colors = [ plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels)) ] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1]
# The silhouette coefficient can range from -1, 1 but in this example all # lie within [-0.1, 1] ax1.set_xlim([-0.1, 1]) # The (n_clusters+1)*10 is for inserting blank space between silhouette # plots of individual clusters, to demarcate them clearly. ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) # Initialize the clusterer with n_clusters value and a random generator # seed of 10 for reproducibility. clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(X) # The silhouette_score gives the average value for all the samples. # This gives a perspective into the density and separation of the formed # clusters silhouette_avg = silhouette_score(X, cluster_labels) print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg) # Compute the silhouette scores for each sample sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): # Aggregate the silhouette scores for samples belonging to # cluster i, and sort them ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort()
n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print() print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)) print() if not opts.use_hashing: print("Top terms per cluster:") if opts.n_components: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] terms = vectorizer.get_feature_names() for i in range(true_k): print("Cluster %d:" % i, end='')