def test_ax(self):
    '''Check which axes plot_silhouette returns with and without ``ax=``.'''
    np.random.seed(0)
    labels = KMeans().fit_predict(self.X)
    plot_silhouette(self.X, labels)

    _, target_ax = plt.subplots(1, 1)

    # Without ax=..., the returned axes must not be the one just created.
    returned_ax = plot_silhouette(self.X, labels)
    assert returned_ax is not target_ax

    # With ax=..., the very same axes object must be handed back.
    returned_ax = plot_silhouette(self.X, labels, ax=target_ax)
    assert returned_ax is target_ax
def test_array_like(self):
    '''Call plot_silhouette with list inputs and with string labels.'''
    # Samples and labels both converted with tolist().
    list_features = self.X.tolist()
    list_labels = self.y.tolist()
    plot_silhouette(list_features, list_labels)

    # List samples paired with labels run through convert_labels_into_string.
    list_features = self.X.tolist()
    string_labels = convert_labels_into_string(self.y)
    plot_silhouette(list_features, string_labels)
def test_cmap(self):
    '''Call plot_silhouette with cmap as a name and as a colormap object.'''
    np.random.seed(0)
    labels = KMeans().fit_predict(self.X)

    # Same colormap, once spelled as a string and once as the pyplot object.
    for colormap in ('Spectral', plt.cm.Spectral):
        plot_silhouette(self.X, labels, cmap=colormap)
def test_string_classes(self):
    '''Call plot_silhouette with KMeans labels converted to strings.'''
    np.random.seed(0)
    kmeans = KMeans()
    numeric_labels = kmeans.fit_predict(self.X)
    string_labels = convert_labels_into_string(numeric_labels)
    plot_silhouette(self.X, string_labels)
def test_plot_silhouette(self):
    '''Call plot_silhouette on labels predicted by a default KMeans.'''
    np.random.seed(0)
    labels = KMeans().fit_predict(self.X)
    plot_silhouette(self.X, labels)
def cluster_and_plot_pca(df, cluster_range=np.arange(2, 9),
                         ClusterAlgorithm=KMeans,
                         cluster_kwargs=cluster_kwargs, show=True):
    '''An unsupervised learning approach to visualizing the number of
    clusters that are appropriate for a given dataset `df`.

    For every value in `cluster_range` a fresh ClusterAlgorithm is fitted on
    `df`, and two figures are drawn: a scatter of the 2D PCA projection of
    `df` colored by cluster label, and a silhouette plot computed on that
    same 2D projection.

    If using another ClusterAlgorithm than KMeans it must accept the kwarg
    n_clusters and expose `labels_` after fitting.

    Data should have no missing values and all columns should have already
    been transformed into numeric datatypes (i.e. converting categorical
    features into one-hot encoded vectors).

    Example:
        from sklearn.cluster import MiniBatchKMeans
        clust_kwargs = dict(random_state=77)
        cluster_and_plot_pca(wine_df,
                             ClusterAlgorithm=MiniBatchKMeans,
                             cluster_kwargs=clust_kwargs)

    ARGS:
        df <pd.DataFrame>: DataFrame with all numeric data types and no
            missing values.

    KWARGS:
        cluster_range <list> or <array>: List of integers signifying the
            number of clusters to test in sequence.
        ClusterAlgorithm <sklearn.cluster>: Currently supports KMeans and
            MiniBatchKMeans.
        cluster_kwargs <dict>: kwargs for ClusterAlgorithm, not including
            n_clusters. None is treated as no extra kwargs.
        show <bool>: If True, call plt.show() after each pair of plots.

    RETURNS:
        The estimator fitted for the last value in `cluster_range`, or None
        if `cluster_range` is empty.
    '''
    pca = PCA(n_components=2, random_state=777)
    X_pca = pd.DataFrame(pca.fit(df).transform(df))

    # Remember the two principal-component columns now, before any label
    # columns are appended to X_pca inside the loop.
    pc_cols = list(X_pca.columns)

    extra_kwargs = cluster_kwargs or {}

    # Stays None when cluster_range is empty, so the final return is defined.
    clust = None

    for n_clusters in cluster_range:
        clust_col = "clusters_" + str(n_clusters)

        # perform kmeans or some other clustering algorithm
        clust = ClusterAlgorithm(n_clusters=n_clusters, **extra_kwargs)
        clust.fit(df)
        X_pca[clust_col] = clust.labels_

        # plot PCA with segments
        fig, ax = plt.subplots()
        for cluster_id, members in X_pca.groupby(clust_col):
            ax.scatter(members[0], members[1], label=str(cluster_id),
                       alpha=.6)
        ax.grid(alpha=.4)
        sns.despine()
        ax.set_title(
            f"Clusters={n_clusters} Projected on 2D Principal Components",
            size=12)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.legend(loc="best")

        # plot silhouette plot
        # Only the PC columns are passed: X_pca also holds the label columns
        # added above (for this and earlier cluster counts), and feeding
        # those in as features would distort the silhouette values.
        plot_silhouette(X_pca[pc_cols], clust.labels_)
        ax = plt.gca()
        ax.grid(alpha=.4)
        sns.despine()
        ax.set_title(f"Silhouette Plot Clusters={n_clusters}", size=12)

        if show:
            plt.show()

    # return the clustering algorithm for predictions
    return clust