def do_kmeans(df, k):
    """Cluster *df* with k-means, show a labeled scatter plot and a
    silhouette plot, and print the quality metrics.

    :param df: pandas.DataFrame; the first two columns are used as the
        scatter-plot axes (labeled Age / Income).
    :param k: number of clusters.
    """
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=10, max_iter=1000, random_state=40)
    k_means.fit(df)
    wcss = k_means.inertia_  # within-cluster sum of squares
    sil = silhouette_score(df, k_means.labels_)

    plt.style.use('default')

    # Per-sample silhouette values lie in [-1, 1]; clip at 0 so the
    # marker sizes handed to scatter() are never negative (negative
    # sizes raise an error in matplotlib).
    sample_silhouette_values = silhouette_samples(df, k_means.labels_)
    sizes = [200 * max(v, 0.0) for v in sample_silhouette_values]

    plt.figure(figsize=(16, 10))
    plt.grid(True)

    # Points sized by silhouette quality, colored by cluster label;
    # centroids marked with black crosses.
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], s=sizes, c=k_means.labels_)
    plt.scatter(k_means.cluster_centers_[:, 0], k_means.cluster_centers_[:, 1], marker='x', s=300, c="black")

    plt.title("K-Means (K={}, WCSS={:.2f}, Sil={:.2f})".format(k, wcss, sil), fontsize=20)
    plt.xlabel('Age', fontsize=22)
    plt.ylabel('Income', fontsize=22)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.show()

    # Yellowbrick silhouette plot for the already-fitted model.
    visualizer = SilhouetteVisualizer(k_means)
    visualizer.fit(df)
    visualizer.poof()

    print("K={}, WCSS={:.2f}, Sil={:.2f}".format(k, wcss, sil))
Exemple #2
0
def showSilhouette():
    """Fit a 6-cluster MiniBatchKMeans on a synthetic 8-blob dataset
    and render its silhouette plot."""
    # Synthetic dataset with 8 blob centers.
    features, _labels = make_blobs(centers=8)

    # Wrap the clustering model in a silhouette visualizer.
    plot = SilhouetteVisualizer(MiniBatchKMeans(6))
    plot.fit(features)
    plot.poof()
Exemple #3
0
def Silhouette_plot(x, from_k, to_k):
    """Render a silhouette plot for every k in [from_k, to_k] and
    collect the rounded scores as [score, k] pairs."""
    scores = []
    k = from_k
    while k <= to_k:
        # Fit and draw yellowbrick's silhouette visualization for this k.
        viz = SilhouetteVisualizer(KMeans(n_clusters=k))
        viz.fit(x)
        viz.poof()
        scores.append([viz.silhouette_score_.round(3), k])
        k += 1
    return scores
 def silhouette(matrix, k):
     """
     Show a silhouette plot for a k-means clustering of *matrix*.

     Not called explicitly in the pipeline; it is a diagnostic used to
     check whether the decided 'k' (e.g. from the elbow method) is good.

     :param matrix: tf-idf matrix
     :param k: decided k (from elbow matrix)
     :return: None; displays a graph of each cluster's internal
         similarity and its separation from the other clusters.
     """
     # Fit a bounded-iteration KMeans and hand it to yellowbrick.
     model_kmeans = KMeans(n_clusters=k, max_iter=200)
     silhouette = SilhouetteVisualizer(model_kmeans)
     silhouette.fit(matrix)
     silhouette.poof()
Exemple #5
0
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import SilhouetteVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Pre-configured blob generator: fixes the sample count, feature count
# and shuffling so call sites only need to choose the centers.
make_blobs = partial(
    sk_make_blobs, n_samples=N_SAMPLES, n_features=N_FEATURES, shuffle=SHUFFLE
)

if __name__ == '__main__':
    # Synthetic dataset with 8 blob centers.
    X, y = make_blobs(centers=8)

    # Fit a 6-cluster MiniBatchKMeans inside a silhouette visualizer
    # and save the rendered figure to disk.
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)
    visualizer.poof(outpath="images/silhouette.png")
Exemple #6
0
    # print(sample_silhouette_value)
    return sils


# Score candidate cluster counts k = 2..5.
ss = sil_score(x, 2, 5)
print(f'score={ss}')
# Entries appear to be [score, k] pairs (see the [1] index below), so
# max() compares by score first and [1] picks the winning k.
# BUG FIX: output string said "optinum"; corrected to "optimum".
print(f'optimum number of clusters ={max(ss)[1]}')
#
# Visualize Silhouette
# Instantiate the clustering model and visualizer
model = KMeans(n_clusters=3)
visualizer = SilhouetteVisualizer(model)
# Fit the training data to the visualizer
visualizer.fit(x)
# Draw/show/poof the data
visualizer.poof()
print(visualizer.silhouette_score_)  # near 1 is good


def Silhouette_plot(x, from_k, to_k):
    """Draw a silhouette plot for each candidate k in [from_k, to_k]
    and return the collected [rounded_score, k] pairs."""
    results = []
    for n_clusters in range(from_k, to_k + 1):
        # Fit and render yellowbrick's silhouette visualization.
        viz = SilhouetteVisualizer(KMeans(n_clusters=n_clusters))
        viz.fit(x)
        viz.poof()
        results.append([viz.silhouette_score_.round(3), n_clusters])
    return results
                   axis=1)) / df_normalized.shape[0])
# Plot the elbow curve of distortion versus k.
plt.plot(K, distortions, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.show()

# Draw a silhouette graph for each candidate cluster count so the
# optimal number of clusters can be picked visually.
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
for n_clusters in range(2, 9):
    model = SilhouetteVisualizer(KMeans(n_clusters))
    model.fit(df_normalized)
    model.poof()

# Utilize t-SNE to visualize the data in two dimensions.
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import pylab as pl
num_of_clusters = 4
kmeans = KMeans(n_clusters=num_of_clusters)
kmeans.fit(df_normalized)

# Embed the normalized features into 2-D; the loop below colors each
# embedded point by its k-means cluster label.
X = TSNE(n_components=2).fit_transform(df_normalized)

for i in range(0, X.shape[0]):
    if kmeans.labels_[i] == 0:
        c1 = pl.scatter(X[i, 0], X[i, 1], c='red')
    elif kmeans.labels_[i] == 1:
Exemple #8
0
# Title, persist to disk, then display the gap-statistic chart
# (saving must happen before show()).
plt.title('Gap Values by Cluster Count')
plt.savefig("Gap Values.png")
plt.show()

# =============================================================================

# =============================================================================
# Using the silhouette to find the optimal number of clusters

# Draw a silhouette plot per candidate k and report the mean score.
for n_clusters in range(4, 10):
    model = KMeans(n_clusters, init='k-means++')
    cluster_labels = model.fit_predict(X)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)  # Fit the training data to the visualizer
    # show() both renders the figure and saves it to *outpath*.
    # BUG FIX: the original also called poof() — yellowbrick's
    # deprecated alias of show() — which drew the same figure twice.
    visualizer.show(outpath="BoW_Silhouette %d" % n_clusters)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)

# =============================================================================

# =============================================================================
# Clustering Using K-Means with a fixed seed for reproducibility.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

# reduce the features to 2D
# NOTE(review): `pca` is not defined in this fragment — it is assumed
# to be a PCA instance created elsewhere; confirm before reuse.
reduced_features = pca.fit_transform(X)
# reduce the cluster centers to 2D
Exemple #9
0
# Plot SSE against cluster count and save the figure.
plt.figure(dpi=150)
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
# assumes SSE holds one sum-of-squared-errors value per k in 2..29 —
# TODO confirm against where SSE is built.
plt.plot(range(2, 30), SSE)
plt.savefig("cluster_plot_tfidf")

# Cluster the NMF-reduced tf-idf features into 16 groups.
km_tfidf = MiniBatchKMeans(n_clusters=16, random_state=4444)
nmf_tfidf_clusters2 = km_tfidf.fit_predict(nmf_tfidf_data)

#Silhouette Plot

# BUG FIX: random_state belongs to MiniBatchKMeans (matching km_tfidf
# above), not to SilhouetteVisualizer, which does not take that keyword.
visualiser_tfidf = SilhouetteVisualizer(
    MiniBatchKMeans(n_clusters=16, random_state=4444))
visualiser_tfidf.fit(nmf_tfidf_data)
visualiser_tfidf.poof()

# t-SNE plot: embed the NMF features into 2-D for visualization.
model_2 = TSNE(n_components=2, random_state=0, verbose=0)
low_data_2 = model_2.fit_transform(nmf_tfidf_data)

# One distinct named color per cluster (17 entries for up to 16 clusters).
colors = [
    'crimson', 'b', 'mediumseagreen', 'cyan', 'm', 'y', 'k', 'orange',
    'springgreen', 'deepskyblue', 'yellow', 'teal', 'navy', 'plum',
    'darkslategray', 'lightcoral', 'papayawhip'
]

plt.figure(dpi=150)

for i, c, label in zip(range(16), colors, list(range(16))):
Exemple #10
0
def silhouette_method(matrix, k):
    """Fit k-means with *k* clusters on *matrix* and display the
    corresponding silhouette plot."""
    viz = SilhouetteVisualizer(KMeans(n_clusters=k, max_iter=200))
    viz.fit(matrix)
    viz.poof()
Exemple #11
0
    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )

# Build a tf-idf document-term matrix from the hobbies corpus.
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer(stop_words='english')
docs = tfidf.fit_transform(corpus.data)

# Silhouette plot for a 6-cluster k-means model.
visualizer = SilhouetteVisualizer(KMeans(n_clusters=6))
visualizer.fit(docs)
visualizer.poof()

# Elbow sweep over k in [4, 10] using the silhouette metric.
visualizer = KElbowVisualizer(KMeans(), metric='silhouette', k=[4,10])
visualizer.fit(docs)
visualizer.poof()