コード例 #1
0
def clustering(fname="clustering.png"):
    """Draw a two-panel clustering figure and save it under FIGURES.

    The left panel holds a KElbowVisualizer wrapping MiniBatchKMeans with
    k=(3, 12); the right panel holds a SilhouetteVisualizer wrapping Birch
    with n_clusters=5. Both are fit on the same make_blobs(centers=7) data.

    Parameters
    ----------
    fname : str
        File name of the saved image; it is joined onto FIGURES.
    """
    # One row, two columns: unpack the pair of axes directly
    _, (left_ax, right_ax) = plt.subplots(ncols=2, figsize=(18, 6))
    points, targets = make_blobs(centers=7)

    # Left panel: K-Elbow
    elbow = KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=left_ax)
    elbow.fit(points, targets)
    elbow.finalize()

    # Right panel: silhouette
    silhouette = SilhouetteVisualizer(Birch(n_clusters=5), ax=right_ax)
    silhouette.fit(points, targets)
    silhouette.finalize()

    # Write the finished figure to disk
    destination = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(destination)
コード例 #2
0
def clustering(fname="clustering.png"):
    """Save a side-by-side K-Elbow / silhouette figure into FIGURES.

    A single make_blobs(centers=7) dataset is generated and used to fit two
    visualizers, one per axes: KElbowVisualizer over MiniBatchKMeans with
    k=(3, 12) on the left, SilhouetteVisualizer over Birch(n_clusters=5) on
    the right.

    Parameters
    ----------
    fname : str
        Name of the output image file inside FIGURES.
    """

    def render(visualizer):
        # Fit on the shared dataset, then finalize the visualizer
        visualizer.fit(features, targets)
        visualizer.finalize()

    # Grid of two axes in a single row
    _, panels = plt.subplots(ncols=2, figsize=(18, 6))
    features, targets = make_blobs(centers=7)

    # Each visualizer is built and rendered before the next one is created
    render(KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=panels[0]))
    render(SilhouetteVisualizer(Birch(n_clusters=5), ax=panels[1]))

    # Persist the figure
    out_path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(out_path)
コード例 #3
0
		if len(feature_names) > 10:
			k_range = range(2, len(feature_names) + 1)
		else:
			k_range = range(2, 2 * len(feature_names) + 1)  # try bigger range for the small-d dataset

		# find optimal k with elbow method
		for metric in ('distortion', 'silhouette', 'calinski_harabasz'):
			print("# Optimizing k for " + label + " with " + metric)
			model = cluster.KMeans(precompute_distances=True, random_state=SEED, n_jobs=-1)
			try:
				visualizer = KElbowVisualizer(model, k=k_range, metric=metric, locate_elbow=True)
			except:
				visualizer = KElbowVisualizer(model, k=k_range, metric=metric, locate_elbow=False)
			visualizer.fit(X)
			visualizer.ax.xaxis.set_major_locator(MaxNLocator(integer=True))
			visualizer.finalize()
			plt.savefig(path.join(PLOT_DIR, abbrev + "_km-rp_elbow_" + metric + ".png"), bbox_inches='tight')
			visualizer.show()
			plt.close()

		# predict best clusters
		print("# Clustering " + label)
		model = cluster.KMeans(n_clusters=best_k, precompute_distances=True, random_state=SEED, n_jobs=-1)
		start_time = time.perf_counter()
		y_pred = model.fit_predict(X)
		run_time = time.perf_counter() - start_time
		print(label + ": run time = " + str(run_time))
		print(label + ": iterations until convergence = " + str(model.n_iter_))
		df = X.assign(cluster=y_pred)
		df.to_pickle(path.join(PKL_DIR, abbrev + "_km-rp.pickle"))