Example #1
0
 def test_ax(self):
     # Checks which axes object plot_silhouette hands back, with and
     # without an explicit ``ax`` argument.
     np.random.seed(0)
     labels = KMeans().fit_predict(self.X)
     plot_silhouette(self.X, labels)

     _, supplied_ax = plt.subplots(1, 1)

     # No ``ax`` given: the returned axes must not be the one created above.
     returned_ax = plot_silhouette(self.X, labels)
     assert returned_ax is not supplied_ax

     # ``ax`` given: the very same object must come back.
     returned_ax = plot_silhouette(self.X, labels, ax=supplied_ax)
     assert returned_ax is supplied_ax
Example #2
0
 def test_array_like(self):
     # Smoke test: plot_silhouette is called with plain Python lists
     # (via ``tolist()``) instead of the fixture objects themselves.
     data_as_list = self.X.tolist()
     labels_as_list = self.y.tolist()
     plot_silhouette(data_as_list, labels_as_list)

     # Same data, labels passed through convert_labels_into_string.
     data_as_list = self.X.tolist()
     converted_labels = convert_labels_into_string(self.y)
     plot_silhouette(data_as_list, converted_labels)
Example #3
0
 def test_cmap(self):
     # Smoke test: ``cmap`` is passed once as a colormap name and once as
     # the matplotlib colormap object.
     np.random.seed(0)
     labels = KMeans().fit_predict(self.X)
     for colormap in ('Spectral', plt.cm.Spectral):
         plot_silhouette(self.X, labels, cmap=colormap)
Example #4
0
 def test_string_classes(self):
     # Smoke test: cluster labels are run through convert_labels_into_string
     # before being handed to plot_silhouette.
     np.random.seed(0)
     labels = KMeans().fit_predict(self.X)
     converted_labels = convert_labels_into_string(labels)
     plot_silhouette(self.X, converted_labels)
Example #5
0
 def test_plot_silhouette(self):
     # Baseline smoke test: fit KMeans with default settings on the fixture
     # data and plot the silhouette for the resulting labels.
     np.random.seed(0)
     labels = KMeans().fit_predict(self.X)
     plot_silhouette(self.X, labels)
Example #6
0
def cluster_and_plot_pca(df,
                         cluster_range=np.arange(2, 9),
                         ClusterAlgorithm=KMeans,
                         cluster_kwargs=cluster_kwargs,
                         show=True):
    '''An unsupervised learning approach to visualizing the number of clusters
    that are appropriate for a given dataset `df`.  If using another
    ClusterAlgorithm than KMeans it must accept the kwarg n_clusters.

    For every value in `cluster_range` the algorithm is fit on `df` and two
    figures are created: a scatter of the first two principal components
    colored by cluster label, and a silhouette plot computed on those same
    two principal components.

    Data should have no missing values and all columns should have already
    been transformed into numeric datatypes (i.e. converting categorical
    features into one-hot encoded vectors).

        Example:
            from sklearn.cluster import MiniBatchKMeans

            clust_kwargs = dict(random_state=77)
            cluster_and_plot_pca(wine_df,
                                ClusterAlgorithm=MiniBatchKMeans,
                                cluster_kwargs=clust_kwargs)

    ARGS:
        df <pd.DataFrame>: DataFrame with all numeric data types and no
            missing values.
    KWARGS:
        cluster_range <list> or <array>: List of integers signifying the number
            of clusters to test in sequence.
        ClusterAlgorithm <sklearn.cluster>: Currently supports KMeans
            and MiniBatchKMeans.
        cluster_kwargs <dict>: kwargs for ClusterAlgorithm, not including
            n_clusters.
        show <bool>: When True, call plt.show() once after all figures have
            been created.
    RETURNS:
        The estimator fit for the last value in `cluster_range` (usable for
        predictions), or None when `cluster_range` is empty.
    '''
    pca = PCA(n_components=2, random_state=777)
    # Keep the raw 2D projection separate from the plotting frame below, so
    # the silhouette is never computed on the cluster-label columns.
    components = pca.fit(df).transform(df)
    X_pca = pd.DataFrame(components)

    # Stays None if cluster_range is empty, so the final return cannot fail
    # with an unbound local.
    clust = None

    for n_clusters in cluster_range:
        clust_col = "clusters_"+str(n_clusters)

        # perform kmeans or some other clustering algorithm
        clust = ClusterAlgorithm(n_clusters=n_clusters, **cluster_kwargs)
        clust.fit(df)
        X_pca[clust_col] = clust.labels_

        # plot PCA with segments, one scatter series per cluster label
        _, ax = plt.subplots()
        for label, members in X_pca.groupby(clust_col):
            ax.scatter(members[0], members[1], label=str(label), alpha=.6)
        ax.grid(alpha=.4)
        sns.despine()
        ax.set_title(f"Clusters={n_clusters} Projected on 2D Principal Components",
                    size=12)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        ax.legend(loc="best")

        # plot silhouette plot on the two principal components only; X_pca
        # also holds the label columns from this and earlier iterations,
        # which would distort the distances.
        plot_silhouette(components, clust.labels_)
        ax = plt.gca()
        ax.grid(alpha=.4)
        sns.despine()
        ax.set_title(f"Silhouette Plot Clusters={n_clusters}", size=12)

    if show:
        plt.show()

    # return the clustering algorithm for predictions
    return clust