# Build a similarity matrix from pairwise Euclidean distances,
# rescaled into [0, 1] (1 = identical points, 0 = farthest pair).
D = distance.squareform(distance.pdist(X))
S = 1 - (D / np.max(D))

##############################################################################
# Compute DBSCAN
# NOTE: eps/min_samples are estimator parameters in the scikit-learn API;
# passing them to fit() was removed long ago.
db = DBSCAN(eps=0.95, min_samples=10).fit(S)
core_samples = db.core_sample_indices_
labels = db.labels_

# Number of clusters in labels, ignoring noise if present (noise is -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))

##############################################################################
# Plot result
import pylab as pl
from itertools import cycle

pl.close('all')
pl.figure(1)
pl.clf()

# Black removed and is used for noise instead.
colors = cycle('bgrcmybgrcmybgrcmybgrcmy')
for k, col in zip(set(labels), colors):
    if k == -1:
        # Black used for noise.
        # NOTE(review): the source chunk is truncated here — the body that
        # recolors noise points (and the rest of the plotting loop) is not
        # visible; confirm against the original scikit-learn DBSCAN example.
        pass
# Ground-truth class count and labels for the digits benchmark.
n_digits = len(np.unique(digits.target))
labels = digits.target

print("n_digits: %d" % n_digits)
print("n_features: %d" % n_features)
print("n_samples: %d" % n_samples)
print()

print("Raw k-means with k-means++ init...")
t0 = time()
# `k` was renamed to `n_clusters` in the scikit-learn KMeans API.
km = KMeans(init="k-means++", n_clusters=n_digits, n_init=10).fit(data)
print("done in %0.3fs" % (time() - t0))
print("Inertia: %f" % km.inertia_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print()

print("Raw k-means with random centroid init...")
t0 = time()
km = KMeans(init="random", n_clusters=n_digits, n_init=10).fit(data)
print("done in %0.3fs" % (time() - t0))
print("Inertia: %f" % km.inertia_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print()

print("Raw k-means with PCA-based centroid init...")
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
# L2-normalize rows in place so k-means on the sparse matrix behaves
# like spherical k-means (cosine-like geometry).
X = Normalizer(norm="l2", copy=False).transform(X)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

###############################################################################
# Now sparse MiniBatchKmeans

print("_" * 80)

# `k` -> `n_clusters` and `chunk_size` -> `batch_size` in the
# scikit-learn MiniBatchKMeans API.
mbkm = MiniBatchKMeans(init="random", n_clusters=true_k, max_iter=10,
                       random_state=13, batch_size=1000, tol=0.0, n_init=1)
print("Clustering sparse data with %s" % str(mbkm))
print()
t0 = time()
mbkm.fit(X)
print("done in %0.3fs" % (time() - t0))

# randindex is presumably a helper defined elsewhere in this file —
# TODO confirm it computes the Rand index on two label arrays.
ri = randindex(labels, mbkm.labels_)
vmeasure = metrics.v_measure_score(labels, mbkm.labels_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_))
print("V-measure: %0.3f" % vmeasure)
print("Rand-Index: %.3f" % ri)
print()
# Similarities as negative squared Euclidean distances:
# S[i, j] = -||x_i - x_j||^2, expanded via the squared-norm identity.
X_norms = np.sum(X ** 2, axis=1)
S = -X_norms[:, np.newaxis] - X_norms[np.newaxis, :] + 2 * np.dot(X, X.T)
p = 10 * np.median(S)

##############################################################################
# Compute Affinity Propagation
# S is a precomputed similarity matrix and p the preference; in the
# scikit-learn API both are estimator parameters, not fit() arguments.
af = AffinityPropagation(affinity="precomputed", preference=p).fit(S)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))

##############################################################################
# Plot result
import pylab as pl
from itertools import cycle

pl.close('all')
pl.figure(1)
pl.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    pl.plot(X[class_members, 0], X[class_members, 1], col + '.')
# Ground-truth class count and labels for the digits benchmark.
# NOTE(review): this chunk duplicates an earlier, nearly identical section
# of the file — consider deduplicating.
n_digits = len(np.unique(digits.target))
labels = digits.target

print("n_digits: %d" % n_digits)
print("n_features: %d" % n_features)
print("n_samples: %d" % n_samples)
print()

print("Raw k-means with k-means++ init...")
t0 = time()
# `k` was renamed to `n_clusters` in the scikit-learn KMeans API.
km = KMeans(init='k-means++', n_clusters=n_digits, n_init=10).fit(data)
print("done in %0.3fs" % (time() - t0))
print("Inertia: %f" % km.inertia_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print()

print("Raw k-means with random centroid init...")
t0 = time()
km = KMeans(init='random', n_clusters=n_digits, n_init=10).fit(data)
print("done in %0.3fs" % (time() - t0))
print("Inertia: %f" % km.inertia_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print()

print("Raw k-means with PCA-based centroid init...")
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1