#%% Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from gap_statistic import OptimalK
#%% Importing the dataset
# Read the CSV, using its first column as the row index.
dataset = pd.read_csv('clustering/pcs.csv', index_col=0)
# Row labels and the underlying value array, taken from the same frame.
names, X = dataset.index, dataset.values
#%% Using the elbow method to find the optimal number of clusters
# Candidate cluster counts, shared by the fitting loop and the x-axis below.
elbow_ks = range(1, 20)
wcss = []
for i in elbow_ks:
    print(i)  # progress indicator: the cluster count currently being fitted
    # fit() returns the fitted estimator, so `kmeans` is the trained model.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=14).fit(X)
    # inertia_ is the quantity plotted as WCSS.
    wcss.append(kmeans.inertia_)

# Draw the elbow curve: one WCSS value per candidate cluster count.
plt.plot(elbow_ks, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('The Elbow Method')
plt.show()
#%% Gap Statistic
# Estimator configured with the joblib backend and 4 jobs.
optimalK = OptimalK(parallel_backend='joblib', n_jobs=4)
# Evaluate every cluster count from 1 to 49 inclusive.
gap_ks = np.arange(1, 50)
n_clusters = optimalK(X, cluster_array=gap_ks)
# Keep the per-k results table around under the name `test` for inspection.
test = optimalK.gap_df
optimalK.plot_results()
#%% Training the K-Means model on the dataset
# Starting state for a best-model search: no model yet, and an initial WCSS
# value of 160000.
# NOTE(review): presumably 160000 is an upper bound that the first real WCSS
# is expected to beat; the search loop itself is not visible here - confirm.
best_model, best_wcss = [], 160000
#%% Example 2
# Make some test data: six one-dimensional samples, one per row.
# Alternative inputs tried previously (kept for reference):
#X, y = make_blobs(n_samples=int(1e5), n_features=2, centers=3, random_state=25)
#print('Data shape: ', X.shape)
#print(X, type(X))
#X = np.array([[100., 1.], [200.,1.],[220.,1.],[230.,1.], [500.,1.], [600.,1.]])
#X = np.array([[100.],[200.],[300.],[400.], [500.], [600.]])
#X = np.array([[100.],[180.],[300.],[410.], [500.], [610.]])
X = np.array([[value] for value in (100., 200., 220., 230., 580., 600.)])
# Show the array and its type before clustering.
print(X, type(X))
# Call OptimalK to determine best number of clusters
print('Calculating optimal number of clusters')
# Candidate cluster counts 1..5, evaluated with n_refs=100.
small_ks = np.arange(1, 6)
n_clusters = optimalK(X, n_refs=100, cluster_array=small_ks)
print('Optimal clusters: ', n_clusters)
# Print the "diff" column of the results table.
gap_results = optimalK.gap_df
print('Diff', gap_results["diff"])
optimalK.plot_results()
# Plot some results: the gap value for each cluster count, with the selected
# count highlighted by a large red marker.
gap_table = optimalK.gap_df
plt.plot(gap_table.n_clusters, gap_table.gap_value, linewidth=3)
# Row(s) of the results table matching the chosen number of clusters.
chosen = gap_table[gap_table.n_clusters == n_clusters]
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.title('Gap Values by Cluster Count')
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.show()

# With the optimal cluster count in hand, construct our own KMeans estimator
# for it (the estimator is only created here, not fitted).
km = KMeans(n_clusters=n_clusters)