# NOTE(review): the statements down to `best_r = r` use `r`, `kmodes` and
# `best_score`, none of which is defined in this excerpt; they read like the
# body of a search loop whose header is outside the visible source. Re-indent
# this fragment to match that loop when merging -- TODO confirm.
kmodes.fit(df_original[features].values)
# Snapshot the fitted model's outputs (`inertia` is not used again in this excerpt).
centroids, clusters, inertia = kmodes.centroids, kmodes.labels, kmodes.inertia
# Score the assignments against df_original[target] with adjusted_rand_score.
score = adjusted_rand_score(df_original[target], clusters)
# Remember the run with the highest score seen so far, and the `r` that produced it.
if score > best_score:
    best_clusters = clusters
    best_centroids = centroids
    best_score = score
    best_r = r

# The best assignments are what the following cells evaluate.
kmodes_clusters = best_clusters
best_score

# In[23]:

print(get_metrics(y_encoded, kmodes_clusters, df_OHE[features_OHE], alg='kmodes'))

# In[24]:

# Silhouette computed on df_LE[features_LE] with the Hamming metric.
silhouette_score(df_LE[features_LE], kmodes_clusters, metric='hamming')

# To visualize the clustering results we plot the one-hot encoded data,
# coloured by the cluster assignments obtained from k-modes.
#

# In[25]:

# NOTE(review): the excerpt ends here; this may be a truncated statement.
centroids
plt.ylabel('2nd component') plt.show() fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(111, projection='3d') ax.scatter(x, _y, z, c=kproto.labels, s=50, cmap='viridis', alpha=0.5) ax.set_xlabel('X1', fontsize=20) ax.set_ylabel('X2', fontsize=20) ax.set_zlabel('X3', fontsize=20) true_labels = np.array(y_encoded[target]) pred = rename_labels(true_labels, kproto.labels) print(kproto.labels.shape, true_labels.shape, pred.shape) get_metrics(true_labels, pred, X=X_pca, alg='kproto') # In[22]: from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score print('With PCA') print( confusion_matrix(y_encoded[target], rename_labels(y_encoded[target], kproto.labels))) print( accuracy_score(y_encoded[target], rename_labels(y_encoded[target], kproto.labels))) print('\nWithout PCA') print(
# NOTE(review): the next line closes a call whose opening is outside this excerpt.
cmap='viridis')
# centroids_pcs = get_components(model.centroids, n_components=0.9).values
# NOTE(review): the PCA here is fitted on kme.centroids alone, so the projected
# centroids may not share the coordinate system of the scatter drawn by the
# preceding (not visible) call -- confirm against the PCA used for the data.
centroids_pcs = PCA(n_components=n_comp).fit_transform(kme.centroids)
# Overlay the first two projected centroid columns as red crosses.
plt.scatter(centroids_pcs[:, 0], centroids_pcs[:, 1], marker='x', c='r', s=200)
plt.title('K-means')
plt.show()

# ### Clustering metrics

# In[122]:

from cluster.metrics import get_metrics

# Metrics for the k-means labels, evaluated on X_scaled_encoded.
get_metrics(y, kme.labels, X_scaled_encoded, alg='kmeans')

# In[123]:

# Disabled 3-D scatter of the PCA-projected data; kept as in the original notebook.
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# fig = plt.figure(figsize=(10, 10))
# ax = fig.add_subplot(111, projection='3d')
# x = X_scaled_encoded_pca.values[:, 0]
# y = X_scaled_encoded_pca.values[:, 1]
# z = X_scaled_encoded_pca.values[:, 2]
# ax.scatter(x, y, z, c=clusters, s=50, cmap='viridis')
# ax.scatter(model.centroids[:,0], model.centroids[:,1], model.centroids[:,2], marker='x', c='r', s=200);
# Fit k-prototypes on X_pca and plot the samples on their first two columns,
# coloured by the assigned cluster.
kp.fit(X_pca)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kp.labels, s=50, cmap='viridis')
# The model was fitted on X_pca, so its centroids are already expressed in the
# coordinates being plotted. Fitting a second PCA on the centroids alone (as
# the previous version did) would move them into an unrelated coordinate
# system, and fails outright when there are fewer than 3 centroids or columns.
centroids_pca = kp.centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', c='r', s=200)
plt.title('K-prototypes')
plt.show()

# **Metrics**

# In[20]:

from cluster.metrics import get_metrics

# Metrics of the labels evaluated on the data the model was fitted on.
get_metrics(y, kp.labels, X_pca, alg='kproto')

# **Previous metrics**

# In[21]:

# Same labels, evaluated against X instead of X_pca.
get_metrics(y, kp.labels, X, alg='kproto')

# ## 5. Use SOM to cluster

# In[26]:

import numpy as np
import matplotlib.pyplot as plt
# In[89]: # Different metrics from cluster.metrics import get_metrics aff = 'euclidean' link = 'complete' agglo = AgglomerativeClustering(affinity=aff, compute_full_tree='auto', linkage=link, memory=None, n_clusters=2, pooling_func='deprecated') agglo.fit(X_num_scaled) print(get_metrics(y, agglo.labels_, X_num_scaled, alg='agglo')) # **Confusion matrix Agglomerative** # In[90]: from sklearn.metrics import confusion_matrix, accuracy_score def rename_labels(y_true, y_pred): from scipy.stats import mode mapping = {} for cat in set(y_true): predictions = y_pred[y_true == cat] predictions = [ p for p in predictions if p not in list(mapping.values())
# Samples on the first two columns of X_pca, coloured by cluster, with the
# model centroids overlaid as red crosses.
sample_style = dict(c=kp.labels, s=50, cmap='viridis')
centroid_style = dict(marker='x', c='r', s=200)
plt.scatter(X_pca[:, 0], X_pca[:, 1], **sample_style)
centroids_pca = kp.centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], **centroid_style)
plt.title('K-prototypes with principal components')
#plt.savefig(fname='numerical_pca2_kprototypes')
plt.show()

# **Metrics with PCA**

# In[28]:

from cluster.metrics import get_metrics

get_metrics(y, kp.labels, X_pca, alg='kproto')

# In[29]:

# Visualization k-prototypes without PCA
from cluster.kprototypes import KPrototypes
from sklearn.decomposition import PCA

# Two-component projection of the scaled numerical data; the model below is
# fitted on the unprojected values.
pca = PCA(n_components=2)
X_num_scaled_pca = pca.fit_transform(X_num_scaled.values)

# Time model construction plus fitting.
start_time = time.time()
kp = KPrototypes(n_clusters=2, cat_features=[], random_state=9)
kp.fit(X_num_scaled.values)
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % (elapsed_seconds))