def clustering(fname="clustering.png"):
    """Draw a two-panel clustering figure and save it under ``FIGURES``.

    The left panel is a ``KElbowVisualizer`` wrapping ``MiniBatchKMeans``
    with ``k=(3, 12)``; the right panel is a ``SilhouetteVisualizer``
    wrapping ``Birch(n_clusters=5)``. Both are fitted on the same data
    generated by ``make_blobs(centers=7)``.

    Parameters
    ----------
    fname : str
        File name of the saved image, joined onto ``FIGURES``.
    """
    # One row, two columns: elbow plot on the left, silhouette on the right.
    _, (elbow_ax, silhouette_ax) = plt.subplots(ncols=2, figsize=(18, 6))
    features, labels = make_blobs(centers=7)

    # Left panel: K-Elbow.
    elbow = KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=elbow_ax)
    elbow.fit(features, labels)
    elbow.finalize()

    # Right panel: silhouette scores.
    silhouette = SilhouetteVisualizer(Birch(n_clusters=5), ax=silhouette_ax)
    silhouette.fit(features, labels)
    silhouette.finalize()

    # Write the combined figure to disk.
    destination = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(destination)
def clustering(fname="clustering.png"):
    """Save a side-by-side K-Elbow / silhouette figure to ``FIGURES``.

    Data comes from ``make_blobs(centers=7)``. The first axes hosts a
    ``KElbowVisualizer`` over ``MiniBatchKMeans`` (``k=(3, 12)``), the
    second a ``SilhouetteVisualizer`` over ``Birch(n_clusters=5)``.

    Parameters
    ----------
    fname : str
        Name of the output image file inside ``FIGURES``.
    """
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))
    X, y = make_blobs(centers=7)

    # One factory per panel, in left-to-right order; each visualizer is
    # built, fitted and finalized before the next one is constructed.
    panel_factories = (
        lambda ax: KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=ax),
        lambda ax: SilhouetteVisualizer(Birch(n_clusters=5), ax=ax),
    )
    for make_visualizer, panel_ax in zip(panel_factories, axes):
        visualizer = make_visualizer(panel_ax)
        visualizer.fit(X, y)
        visualizer.finalize()

    # Save figure
    out_path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(out_path)
# Choose the candidate cluster counts for the elbow search. With more than
# 10 features k runs from 2 up to the feature count; otherwise the upper
# bound is doubled.
if len(feature_names) > 10:
    k_range = range(2, len(feature_names) + 1)
else:
    k_range = range(2, 2 * len(feature_names) + 1)  # try bigger range for the small-d dataset

# find optimal k with elbow method
# One elbow plot is produced and saved per scoring metric.
for metric in ('distortion', 'silhouette', 'calinski_harabasz'):
    print("# Optimizing k for " + label + " with " + metric)
    # NOTE(review): `precompute_distances` and `n_jobs` were deprecated for
    # KMeans in scikit-learn 0.23 and removed in 1.0 -- confirm the pinned
    # scikit-learn version before upgrading.
    model = cluster.KMeans(precompute_distances=True, random_state=SEED, n_jobs=-1)
    try:
        visualizer = KElbowVisualizer(model, k=k_range, metric=metric, locate_elbow=True)
    except Exception:
        # Fall back to a visualizer without elbow detection when construction
        # with locate_elbow=True fails. `Exception` (not a bare `except:`)
        # lets KeyboardInterrupt and SystemExit propagate.
        visualizer = KElbowVisualizer(model, k=k_range, metric=metric, locate_elbow=False)
    visualizer.fit(X)
    # Cluster counts are integers, so force integer ticks on the x axis.
    visualizer.ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    visualizer.finalize()
    # Save before show(); the figure is closed right after being displayed.
    plt.savefig(path.join(PLOT_DIR, abbrev + "_km-rp_elbow_" + metric + ".png"), bbox_inches='tight')
    visualizer.show()
    plt.close()

# predict best clusters
# NOTE(review): `best_k` is not assigned anywhere in this fragment --
# presumably it is set earlier in the file; confirm.
print("# Clustering " + label)
model = cluster.KMeans(n_clusters=best_k, precompute_distances=True, random_state=SEED, n_jobs=-1)
# Time only the fit/predict call.
start_time = time.perf_counter()
y_pred = model.fit_predict(X)
run_time = time.perf_counter() - start_time
print(label + ": run time = " + str(run_time))
print(label + ": iterations until convergence = " + str(model.n_iter_))
# Attach the predicted cluster label as a `cluster` column and persist it.
df = X.assign(cluster=y_pred)
df.to_pickle(path.join(PKL_DIR, abbrev + "_km-rp.pickle"))