import matplotlib.pyplot as plt
from cluster.kmeans import KMeans


def elbow(X, range_clusters=range(2, 6)):
    """Plot inertia against k so the elbow can be read off visually."""
    inertias = []
    ks = range_clusters
    for k in ks:
        model = KMeans(n_clusters=k, random_state=12)
        model.fit(X.values)
        # centroids_, clusters_, inertia_ = k_means(X_final.values, k=k)
        inertias.append(model.inertia)
    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
def elbow(X, range_clusters=range(2, 6), alg='kmeans', cat_features=[], random_state=42):
    """Elbow plot generalized to k-means, k-modes, and k-prototypes."""
    inertias = []
    ks = range_clusters
    model = None
    for k in ks:
        if alg == 'kmeans':
            model = KMeans(n_clusters=k, random_state=random_state)
        elif alg == 'kmodes':
            model = KModes(n_clusters=k, random_state=random_state)
        else:
            model = KPrototypes(n_clusters=k, cat_features=cat_features,
                                random_state=random_state)
        model.fit(X.values)
        # centroids_, clusters_, inertia_ = k_means(X_final.values, k=k)
        inertias.append(model.inertia)
    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.title(alg)
    plt.xticks(ks)
    plt.show()
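# A minimal usage sketch for the elbow helper above (the demo data here is
# illustrative; in the notebooks it is called on the scaled feature DataFrame,
# and it assumes this repo's KMeans/KModes/KPrototypes are importable):
import pandas as pd
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
elbow(pd.DataFrame(X_demo), range_clusters=range(2, 9), alg='kmeans')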
import os

import numpy as np

# KMeans, normalize, and plotscatterhist are helpers defined elsewhere in this module.


def make_plots():
    """Make scatter plots of each E-step and M-step of the k-means algorithm.

    Uses the Old Faithful dataset.
    """
    X = np.loadtxt(os.path.join('data', 'faithful.txt'))
    X = normalize(X)
    init_means = np.array([[-1.75, 1], [1.75, -1]])
    clf = KMeans(init_centers=init_means, n_clusters=2)
    tmpdir = 'tmp'
    try:
        os.makedirs(tmpdir)
    except OSError:
        pass
    basename = os.path.join(tmpdir, 'faithful_kmeans')
    centers = init_means
    plotscatterhist(X, [], '{0}_{1}.png'.format(basename, 0), centers, lumped=True, dpi=200)
    for i in range(1, 50, 2):
        print('iteration:', i, end=' ')
        centers_old = centers.copy()
        labels, inertia = clf._e_step(X, centers)
        print('inertia:', inertia)
        plotscatterhist(X, labels, '{0}_{1}.png'.format(basename, i), centers, lumped=False, dpi=200)
        centers = clf._m_step(X, labels)
        plotscatterhist(X, labels, '{0}_{1}.png'.format(basename, i + 1), centers, lumped=False, dpi=200)
        if np.sum((centers_old - centers) ** 2) < 1e-2:
            break
    print('starting video encoding...', end=' ')
    print('done.')
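# What the E- and M-steps above presumably compute, written out as plain
# functions (a sketch of the conventional k-means updates; the actual
# _e_step/_m_step implementations may differ in details):
import numpy as np
from scipy.spatial import distance


def e_step(X, centers):
    """Assign each point to its nearest center; inertia = sum of squared distances."""
    d = distance.cdist(X, centers)      # pairwise point-to-center distances
    labels = d.argmin(axis=1)
    inertia = float((d.min(axis=1) ** 2).sum())
    return labels, inertia


def m_step(X, labels):
    """Move each center to the mean of the points assigned to it."""
    return np.array([X[labels == k].mean(axis=0) for k in np.unique(labels)])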
def __init__(self, data, labels, k=None, radius=None, metric='euclidean'):
    # defined attributes
    self.data = data
    self.labels = labels
    self.k = k
    self.radius = radius
    self.metric = metric
    # computed attributes
    if k:
        cluster = KMeans(data, k).cluster()
    else:
        cluster = GMeans(data).cluster()
        self.k = cluster.k
    self.centroids = cluster.centroids
    self.weights = None
    # compute heuristic for radius if none provided
    if not radius:
        n, d = self.data.shape
        centroid = np.mean(data, axis=0)
        # max distance from the global data centroid, shrunk by k**(1/d)
        self.radius = np.max(distance.cdist(centroid[np.newaxis, :], data)) / (self.k ** (1 / d))
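# The radius heuristic above, isolated as a standalone helper for clarity
# (a sketch; the function name is illustrative, not part of the class):
import numpy as np
from scipy.spatial import distance


def default_radius(data, k):
    """Max distance from the global centroid, shrunk by k**(1/d)."""
    n, d = data.shape
    centroid = data.mean(axis=0)
    return np.max(distance.cdist(centroid[np.newaxis, :], data)) / (k ** (1 / d))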
def get_clusterer(n_clusters, cat_features=[], alg='kmeans', agglo_params=None, random_state=10):
    clusterer = None
    metric = ''
    # if len(cat_features) == 0:
    if alg == 'agglo':
        clusterer = AgglomerativeClustering(affinity=agglo_params[0], compute_full_tree='auto',
                                            linkage=agglo_params[1], memory=None,
                                            n_clusters=n_clusters, pooling_func='deprecated')
        metric = 'euclidean'
    elif alg == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
        metric = 'euclidean'
    elif alg == 'kmodes':
        clusterer = KModes(n_clusters=n_clusters, random_state=random_state)
        metric = 'manhattan'
    elif alg == 'fuzzy':
        clusterer = FuzzyCMeans(n_clusters=n_clusters, random_state=random_state)
        metric = 'euclidean'
    # else:
    elif alg == 'kproto':
        clusterer = KPrototypes(n_clusters=n_clusters, cat_features=cat_features,
                                random_state=random_state)
        metric = 'manhattan'
    return clusterer, metric
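# Usage sketch for get_clusterer (assumes the clusterer classes above are
# importable from this repo; iris is only a stand-in dataset):
from sklearn import datasets

iris_demo = datasets.load_iris()
clusterer, metric = get_clusterer(n_clusters=3, alg='kmeans', random_state=10)
clusterer.fit(iris_demo.data)
labels = clusterer.labels  # per-sample assignments, as used elsewhere in these notebooks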
# Perform K-means with one-hot encoding
#

# In[17]:

from cluster.kmeans import KMeans

best_clusters = None
best_centroids = None
best_r = None
best_score = -9999
for r in range(25):
    kmeans = KMeans(n_clusters=3, random_state=r)
    kmeans.fit(df_features_OHE.values)
    centroids, clusters, inertia = kmeans.centroids, kmeans.labels, kmeans.inertia
    # centroids, clusters, inertia = k_means(df_.values, k=3, random_state=r)
    score = adjusted_rand_score(y_encoded, clusters)
    if score > best_score:
        best_clusters = clusters
        best_centroids = centroids
        best_score = score
        best_r = r

kmeans_clusters = best_clusters
best_score

# In[18]:
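# The random-restart search in the previous cell, factored into a reusable
# helper (a sketch; the attribute names follow this repo's KMeans API):
import numpy as np
from sklearn.metrics import adjusted_rand_score
from cluster.kmeans import KMeans


def best_kmeans_by_ari(X, y_true, n_clusters=3, n_restarts=25):
    """Fit KMeans once per random state and keep the run that maximizes ARI."""
    best_model, best_score = None, -np.inf
    for r in range(n_restarts):
        km = KMeans(n_clusters=n_clusters, random_state=r)
        km.fit(X)
        score = adjusted_rand_score(y_true, km.labels)
        if score > best_score:
            best_model, best_score = km, score
    return best_model, best_score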
# ## K-Means
#
# Implement your own K-Means (KM) algorithm and apply it to the data in the file.
# Note that you are not allowed to use the sklearn library.
# Choose the best distance metric among euclidean, manhattan, and cosine
# (minimal implementations of the three candidates follow the plotting cell below).

# In[120]:

X_scaled_encoded.head()

# In[121]:

from cluster.kmeans import KMeans

kme = KMeans(n_clusters=3, random_state=11)
kme.fit(X_scaled_encoded.values)
# y = splits['y'][rows_to_remove].values
plt.scatter(X_scaled_encoded_pca.values[:, 0], X_scaled_encoded_pca.values[:, 1],
            c=kme.labels, s=50, cmap='viridis')
# centroids_pcs = get_components(model.centroids, n_components=0.9).values
# Note: this fits a fresh PCA on the centroids themselves, so their projection
# may not align exactly with the PCA used for the data scatter above.
centroids_pcs = PCA(n_components=n_comp).fit_transform(kme.centroids)
plt.scatter(centroids_pcs[:, 0], centroids_pcs[:, 1], marker='x', c='r', s=200)
plt.title('K-means')
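# The three candidate distance metrics named in the task statement above, as
# minimal NumPy implementations (a sketch; the repo's KMeans computes its own):
import numpy as np


def euclidean(a, b):
    return np.sqrt(((a - b) ** 2).sum(axis=-1))


def manhattan(a, b):
    return np.abs(a - b).sum(axis=-1)


def cosine(a, b):
    # cosine distance = 1 - cosine similarity
    num = (a * b).sum(axis=-1)
    den = np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1)
    return 1 - num / den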
from cluster.kmeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Generate dataset
X, y = make_blobs(centers=3, n_samples=500, random_state=1)

# Run k-means with 10 restarts (n_init=10); otherwise a single unlucky
# initialization has a real chance of producing a very poor clustering.
cls = KMeans(n_clusters=3, init="k-means++", n_init=10)
cls.fit(X)

group_colors = ['skyblue', 'coral', 'lightgreen']
colors = [group_colors[j] for j in cls.labels]

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(X[:, 0], X[:, 1], color=colors, alpha=0.5)
ax.scatter(cls.cluster_centers[:, 0], cls.cluster_centers[:, 1],
           color=['blue', 'darkred', 'green'], marker='o', lw=2)
ax.set_xlabel('$x_0$')
ax.set_ylabel('$x_1$')
plt.show()
#
# Before analysing the K-means results for the Breast Cancer Wisconsin dataset,
# it is important to know which random state yields the best clustering. To do
# this, we use the adjusted Rand score as the metric and pick the random state
# with the highest value.

# In[92]:

# best random score:
from cluster.kmeans import KMeans
from sklearn.metrics import adjusted_rand_score

best_clusters = None
best_centroids = None
best_r = None
best_score = -9999
for r in range(20):
    kme = KMeans(n_clusters=2, random_state=r)
    kme.fit(X_num_scaled.values)
    score = adjusted_rand_score(y, kme.labels)
    if score > best_score:
        best_clusters = kme.labels
        best_centroids = kme.centroids
        best_score = score
        best_r = r

fcm_clusters = best_clusters
print('Best score:', best_score)
print('Best random state value:', best_r)

# In[93]:

kme = KMeans(n_clusters=2, random_state=0)
kme.fit(X_num_scaled.values)
from cluster.kmeans import KMeans
from cluster.kmedoids import KMedoids
from cluster.agglomerative_clustering import AgglomerativeClustering
from cluster.dbscan import DBSCAN
from cluster.metrics import purity
from sklearn import datasets

iris = datasets.load_iris()

kmeans = KMeans(n_clusters=3, max_iter=100)
kmeans.fit(iris.data)
print(kmeans.labels_)
print(purity(kmeans.labels_, iris.target))

kmedoids = KMedoids(n_clusters=3, max_iter=100)
kmedoids.fit(iris.data)
print(kmedoids.labels_)
print(purity(kmedoids.labels_, iris.target))

single_agglo = AgglomerativeClustering(
    n_clusters=3,
    linkage=AgglomerativeClustering.SINGLE_LINKAGE,
    affinity=AgglomerativeClustering.EUCLIDEAN_DISTANCE)
single_agglo.fit(iris.data)
print(single_agglo.labels_)
print(purity(single_agglo.labels_, iris.target))

complete_agglo = AgglomerativeClustering(
    n_clusters=3,
    linkage=AgglomerativeClustering.COMPLETE_LINKAGE,
    affinity=AgglomerativeClustering.EUCLIDEAN_DISTANCE)
complete_agglo.fit(iris.data)
print(complete_agglo.labels_)
print(purity(complete_agglo.labels_, iris.target))
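# A conventional definition of the purity metric used above (a sketch;
# cluster.metrics.purity may differ in details):
import numpy as np


def purity_sketch(labels_pred, labels_true):
    """Fraction of samples that fall in the majority true class of their cluster."""
    labels_pred = np.asarray(labels_pred)
    labels_true = np.asarray(labels_true)
    total = 0
    for c in np.unique(labels_pred):
        members = labels_true[labels_pred == c]
        total += np.bincount(members).max()
    return total / len(labels_true)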
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score
from cluster.kmeans import KMeans


def silhouette(X, X_pca, range_clusters=range(2, 5)):
    """
    Function provided by sklearn with some modifications.
    Reference: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
    """
    for n_clusters in range_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        clusterer.fit(X.values)
        cluster_labels = clusterer.labels

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X_pca.values[:, 0], X_pca.values[:, 1], marker='.', s=30,
                    lw=0, alpha=0.7, c=colors, edgecolor='k')

        # Labeling the clusters
        # Note: the centroids live in the original feature space, so they only
        # align with the PCA scatter when X itself is two-dimensional.
        centers = clusterer.centroids
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
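# Usage sketch for the silhouette helper (the demo data is illustrative; in the
# notebooks it is called with the scaled features and their PCA projection):
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

X_demo, _ = make_blobs(n_samples=400, centers=4, n_features=6, random_state=0)
X_demo = pd.DataFrame(X_demo)
X_demo_pca = pd.DataFrame(PCA(n_components=2).fit_transform(X_demo))
silhouette(X_demo, X_demo_pca, range_clusters=range(2, 5))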