kmodes.fit(df_original[features].values)
  centroids, clusters, inertia = kmodes.centroids, kmodes.labels, kmodes.inertia
  score = adjusted_rand_score(df_original[target], clusters)
  if score > best_score:
    best_clusters = clusters
    best_centroids = centroids
    best_score = score 
    best_r = r
# Keep the cluster assignment from the run that reached the highest score.
kmodes_clusters = best_clusters
# Bare expression: displayed as the cell output when run in a notebook.
best_score


# In[23]:


# Print the get_metrics report for the k-modes labels, passing y_encoded and
# the one-hot encoded feature columns of df_OHE.
print(get_metrics(y_encoded, kmodes_clusters, df_OHE[features_OHE], alg='kmodes'))


# In[24]:


# Silhouette score of the k-modes labels using the Hamming distance on the
# df_LE feature columns (presumably label-encoded -- confirm).
silhouette_score(df_LE[features_LE], kmodes_clusters, metric='hamming')


# To visualize the clustering results, we plot the one-hot encoded data colored by the cluster assignments obtained from k-modes.
# 

# In[25]:


centroids
plt.ylabel('2nd component')
plt.show()

# 3D scatter of the points (x, _y, z), coloured by the k-prototypes labels.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x, _y, z, c=kproto.labels, s=50, cmap='viridis', alpha=0.5)
ax.set_xlabel('X1', fontsize=20)
ax.set_ylabel('X2', fontsize=20)
ax.set_zlabel('X3', fontsize=20)

# Pass the cluster ids through rename_labels before scoring them
# (presumably aligns each cluster id with a true class -- confirm in rename_labels).
true_labels = np.array(y_encoded[target])
pred = rename_labels(true_labels, kproto.labels)
# Sanity check: print the three shapes so they can be compared by eye.
print(kproto.labels.shape, true_labels.shape, pred.shape)

get_metrics(true_labels, pred, X=X_pca, alg='kproto')

# In[22]:

from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, accuracy_score

# Run rename_labels on the k-prototypes cluster ids once and reuse the result
# for both reports; the same call was previously repeated for each print.
renamed_labels = rename_labels(y_encoded[target], kproto.labels)

print('With PCA')
print(confusion_matrix(y_encoded[target], renamed_labels))
print(accuracy_score(y_encoded[target], renamed_labels))

print('\nWithout PCA')
print(
            cmap='viridis')

# centroids_pcs = get_components(model.centroids, n_components=0.9).values
# NOTE(review): this fits a separate PCA on the centroids alone, so their
# coordinates may not share axes with the projected data they are drawn
# over -- confirm, and consider reusing the PCA fitted on the data instead.
centroids_pcs = PCA(n_components=n_comp).fit_transform(kme.centroids)

# Mark the first two centroid components with red crosses.
plt.scatter(centroids_pcs[:, 0], centroids_pcs[:, 1], marker='x', c='r', s=200)
plt.title('K-means')
plt.show()

# ### Clustering metrics

# In[122]:

from cluster.metrics import get_metrics

# Report the get_metrics results for the k-means labels on X_scaled_encoded.
get_metrics(y, kme.labels, X_scaled_encoded, alg='kmeans')

# In[123]:

# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D

# fig = plt.figure(figsize=(10, 10))
# ax = fig.add_subplot(111, projection='3d')

# x = X_scaled_encoded_pca.values[:, 0]
# y = X_scaled_encoded_pca.values[:, 1]
# z = X_scaled_encoded_pca.values[:, 2]

# ax.scatter(x, y, z, c=clusters, s=50, cmap='viridis')
# ax.scatter(model.centroids[:,0], model.centroids[:,1], model.centroids[:,2], marker='x', c='r', s=200);
# Example #4
# 0
# Fit the model on the PCA-projected data.
kp.fit(X_pca)

# Plot the first two components, coloured by cluster label.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kp.labels, s=50, cmap='viridis')

# The model was fitted on X_pca, so its centroids are already expressed in the
# same coordinates as the points above. Fitting a second PCA on the centroids
# alone would move them onto unrelated axes (and fails outright when there are
# fewer than 3 centroids), so they are plotted as-is.
centroids_pca = kp.centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', c='r', s=200)
plt.title('K-prototypes')
plt.show()

# **Metrics**

# In[20]:

from cluster.metrics import get_metrics

# Metrics for the k-prototypes labels, computed against the PCA-projected data.
get_metrics(y, kp.labels, X_pca, alg='kproto')

# **Previous metrics**

# In[21]:

# Same labels scored against X (presumably the features before PCA -- confirm).
get_metrics(y, kp.labels, X, alg='kproto')

# ## 5. Use SOM to cluster

# In[26]:

import numpy as np
import matplotlib.pyplot as plt

import numpy as np
# In[89]:

# Different metrics
from cluster.metrics import get_metrics
aff = 'euclidean'
link = 'complete'
# `pooling_func` is intentionally not passed: it was only ever a deprecated
# placeholder and has been removed from scikit-learn, where supplying it raises
# a TypeError. Omitting it is equivalent on the versions that still accepted it.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` --
# confirm against the installed version before upgrading.
agglo = AgglomerativeClustering(affinity=aff,
                                compute_full_tree='auto',
                                linkage=link,
                                memory=None,
                                n_clusters=2)
agglo.fit(X_num_scaled)

# Print the get_metrics report for the agglomerative labels.
print(get_metrics(y, agglo.labels_, X_num_scaled, alg='agglo'))

# **Confusion matrix Agglomerative**

# In[90]:

from sklearn.metrics import confusion_matrix, accuracy_score


def rename_labels(y_true, y_pred):
    from scipy.stats import mode
    mapping = {}
    for cat in set(y_true):
        predictions = y_pred[y_true == cat]
        predictions = [
            p for p in predictions if p not in list(mapping.values())
# Scatter plot with two principal components
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kp.labels, s=50, cmap='viridis')

# The centroids are used directly, with no extra projection, and indexed with
# the same two columns as X_pca.
centroids_pca = kp.centroids
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', c='r', s=200)
plt.title('K-prototypes with principal components')
#plt.savefig(fname='numerical_pca2_kprototypes')
plt.show()

# **Metrics with PCA**

# In[28]:

from cluster.metrics import get_metrics

# Report the get_metrics results for the k-prototypes labels on X_pca.
get_metrics(y, kp.labels, X_pca, alg='kproto')

# In[29]:

# Visualization of k-prototypes without PCA
from cluster.kprototypes import KPrototypes

from sklearn.decomposition import PCA
# 2-component PCA projection of X_num_scaled.
# NOTE(review): X_num_scaled_pca is not used by the fit below, which runs on
# the unprojected values.
pca = PCA(n_components=2)
X_num_scaled_pca = pca.fit_transform(X_num_scaled.values)

# Time the k-prototypes fit; cat_features=[] passes no categorical columns.
start_time = time.time()
kp = KPrototypes(n_clusters=2, cat_features=[], random_state=9)
kp.fit(X_num_scaled.values)
print("--- %s seconds ---" % (time.time() - start_time))