from sklearn.cluster import KMeans

def cluster_images(images, cluster_count):
    """Cluster images into the specified number of clusters using k-means."""
    # All images must share the same dimensions so each centroid can be
    # reshaped back into image form.
    shapes = set(image.data.shape for image in images)
    if len(shapes) > 1:
        raise ValueError("Images should have the same dimensions")
    image_shape = shapes.pop()

    # Run k-means on the flattened pixel vectors.
    vectors = [image.to_vector() for image in images]
    kmeans = KMeans(n_clusters=cluster_count)
    kmeans.fit(vectors)
    centroids = kmeans.cluster_centers_

    # Reshape each centroid back into an image and wrap it in an Image instance.
    return [Image(centroid.reshape(image_shape)) for centroid in centroids]
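# Usage sketch (not part of the original source): cluster_images assumes an
# Image wrapper exposing a `.data` ndarray and a `.to_vector()` method. A
# minimal stand-in compatible with the function above might look like this.
import numpy as np

class Image:
    def __init__(self, data):
        self.data = np.asarray(data)

    def to_vector(self):
        # Flatten the pixel grid into a 1-D feature vector for k-means.
        return self.data.ravel()

# Example: reduce 100 random 8x8 grayscale images to 5 representative images.
# images = [Image(np.random.rand(8, 8)) for _ in range(100)]
# representatives = cluster_images(images, cluster_count=5)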
from sklearn.cluster import KMeans

def k_means(data, n_clusters=5):
    '''
    sklearn-based KMeans method.

    Inputs:
    - data: training data set containing the events to be processed (matrix [m, n])
    - n_clusters: number of clusters to be used

    Outputs: (Z, centroids, kmeans)
    - Z: array assigning each event in the training set to its cluster (array [m])
    - centroids: the cluster centers, i.e. the mean of each cluster (matrix [n_clusters, n])
    - kmeans: the fitted KMeans object
    '''
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=20)
    # Alternatives tried during development:
    # kmeans = KMedians(k=n_clusters)
    # kmeans = FuzzyKMeans(k=7, m=2)
    kmeans.fit(data)

    # Cluster label for each training event.
    Z = kmeans.labels_
    # Cluster centers.
    centroids = kmeans.cluster_centers_

    inertia = kmeans.inertia_
    print('Sum of squared distances of events to their closest cluster center: '
          + str(inertia))
    return Z, centroids, kmeans
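# Illustrative call with synthetic events (an assumption, not from the
# original source): three well-separated Gaussian blobs should come back as
# three clean clusters.
# import numpy as np
# rng = np.random.default_rng(0)
# data = np.vstack([rng.normal(loc=c, scale=0.5, size=(100, 2)) for c in (0.0, 5.0, 10.0)])
# Z, centroids, kmeans = k_means(data, n_clusters=3)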
import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans

def fca(data, n_clusters=3):
    # Ward hierarchical clustering (AgglomerativeClustering with
    # linkage='ward' replaces the long-removed sklearn `Ward` class).
    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(data)
    label = ward.labels_

    centroids = []
    # A one-cluster k-means fit is simply a way to compute the mean of each
    # Ward cluster.
    kmeans = KMeans(init='k-means++', n_clusters=1, n_init=10)
    for i in range(n_clusters):
        # Collect the points (and their row indices) assigned to cluster i.
        c_data = [data[j] for j in range(label.size) if label[j] == i]
        c_index = [j for j in range(label.size) if label[j] == i]
        kmeans.fit(c_data)
        centroid = kmeans.cluster_centers_[0]
        # Pick the member closest to the cluster mean (the medoid) as the
        # cluster's representative.
        c_dist = [sum((x - y) ** 2 for x, y in zip(xdata, centroid))
                  for xdata in c_data]
        c_i = c_dist.index(min(c_dist))
        centroids.append(c_index[c_i])
    return centroids
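# Illustrative call (synthetic data, an assumption; note that fca returns
# the *row indices* of the chosen representative points, not coordinates):
# rng = np.random.default_rng(1)
# points = np.vstack([rng.normal(c, 0.3, size=(50, 2)) for c in (0.0, 3.0, 6.0)])
# representative_rows = fca(points, n_clusters=3)
# representatives = points[representative_rows]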
from sklearn.cluster import KMeans

def calculate_wcss(data):
    # Within-cluster sum of squares (inertia) for k = 2..20, used to locate
    # the "elbow" when choosing the number of clusters.
    wcss = []
    for n in range(2, 21):
        kmeans = KMeans(n_clusters=n)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)
    return wcss
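# One heuristic (not part of the original code) for choosing k from the
# curve returned by calculate_wcss: take the k with the largest second
# difference, i.e. the sharpest bend ("elbow") in the WCSS curve.
import numpy as np

def pick_elbow_k(wcss, k_start=2):
    curve = np.asarray(wcss, dtype=float)
    # Discrete second difference; large values mark the elbow.
    second_diff = curve[:-2] - 2 * curve[1:-1] + curve[2:]
    # Index i of second_diff corresponds to k = k_start + 1 + i.
    return k_start + 1 + int(np.argmax(second_diff))

# k = pick_elbow_k(calculate_wcss(data))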
import numpy as np
import matplotlib.pyplot as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def visualize_clusters(data, target, problem, k):
    # Note: `target` is currently unused in this visualization.
    # Project the data to 2-D with PCA, then cluster in the projected space.
    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we assign a color to each point
    # in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in the mesh using the trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired, aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as white crosses.
    centroids = kmeans.cluster_centers_
    pl.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
               linewidths=3, color='w', zorder=10)
    pl.title('K-means clustering on the ' + problem + ' dataset (PCA-reduced data)\n'
             'Centroids are marked with a white cross')
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    pl.show()
from sklearn.cluster import MiniBatchKMeans as MBKMeans

def MiniBatchKMeans(self, X, batch=10000):
    # This method shares its name with the sklearn estimator, so the import
    # is aliased to avoid confusion.
    print("Fitting mini-batch k-means:", X.shape, "k =", self.k)
    kmeans = MBKMeans(init='k-means++', n_clusters=self.k, batch_size=batch)
    kmeans.fit(X)
    centers = kmeans.cluster_centers_
    clusters = kmeans.labels_  # per-sample labels (computed but not returned)
    print("Shape of centers:", centers.shape)
    return centers
import numpy as np
from sklearn.cluster import KMeans

def calcKMeans(self, data):
    if len(data) == 0:
        return None
    kmeans = KMeans(n_clusters=self.samplesize)
    labels = {}
    for key, val in data.items():
        kmeans.fit(np.squeeze(val))
        # Store the per-sample cluster labels for this key.
        labels[key] = kmeans.labels_
    return labels
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def cluster(x, y, n):
    # Fit k-means and color each point by its predicted cluster.
    # (The y argument is unused.)
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(x)
    y_kmeans = kmeans.predict(x)
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='viridis')

    # Overlay the cluster centers on the same axes.
    centers = kmeans.cluster_centers_
    print(centers)
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
import numpy as np
from sklearn.cluster import KMeans

def calcKMeans(self, data):
    if len(data) == 0:
        return None
    kmeans = KMeans(n_clusters=self.samplesize)
    histData = {}
    for key, val in data.items():
        kmeans.fit(np.squeeze(val))
        # Histogram of cluster assignments: how many samples fell into each
        # of the self.samplesize clusters.
        cnts, _ = np.histogram(kmeans.labels_, self.samplesize)
        histData[key] = cnts
    return histData
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def clusteringKMeans(XVal):
    kmeans = KMeans(n_clusters=4, random_state=0)
    kmeans.fit(XVal)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_

    # One marker style per cluster (five colors cover up to five clusters).
    colors = ["g.", "r.", "y.", "c.", "b."]
    for i in range(len(XVal)):
        plt.plot(XVal[i][0], XVal[i][1], colors[labels[i]], markersize=10)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=150,
                linewidth=5, zorder=10)
    return kmeans, plt, labels
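# Illustrative usage (make_blobs is an assumption, not part of the snippet):
# from sklearn.datasets import make_blobs
# XVal, _ = make_blobs(n_samples=300, centers=4, random_state=0)
# kmeans, plot, labels = clusteringKMeans(XVal)
# plot.show()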
plt.show()

# Combine the x, y, z and a feature arrays into a single array for clustering.
X = np.array([x, y, z, a])

kmeans = KMeans(n_clusters=4)
kmeans.fit(X)

# Centroids of the clusters.
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
# For each K, compare the drop in error between K-1 and K against the drop
# between K and K+1 to find where the improvement in error rates levels off.
# (Assumes `ratio` and `ratio2` were initialized earlier in the script.)
for i in range(1, len(avgWithinSS[name]) - 1):
    if ratio2 > ratio:
        k = i
        ratio = ratio2
    diff = avgWithinSS[name][i - 1] - avgWithinSS[name][i]
    diff2 = avgWithinSS[name][i] - avgWithinSS[name][i + 1]
    ratio2 = diff - diff2

# k-means clustering by PC9 volume.
# Re-run the k-means algorithm for the specific K determined by the elbow test.
list_k = list(range(k))
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)

# Plot the results of the k-means algorithm.
mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
mglearn.discrete_scatter(kmeans.cluster_centers_[:, 0],
                         kmeans.cluster_centers_[:, 1],
                         list_k, markers='^', markeredgewidth=2)

# Store the cluster assignments back in the original data set.
PC9_Shipment_Qty['PC9_Vol_Cluster_Unsorted'] = kmeans.labels_

# Order volume clusters by average shipment volume, from smallest to largest.
Volume_Cluster_Definitions = (PC9_Shipment_Qty
                              .groupby(['PC9', 'PC9_Vol_Cluster_Unsorted']).sum()
                              .groupby('PC9_Vol_Cluster_Unsorted').mean())
Volume_Cluster_Definitions = Volume_Cluster_Definitions.sort_values(by=['PC9_Shipped_Qty'])
Volume_Cluster_Definitions['Unsorted_Cluster'] = Volume_Cluster_Definitions.index.values
Sorted_Grouping_List = list(range(len(Volume_Cluster_Definitions.index)))
Volume_Cluster_Definitions['Sorted_Cluster'] = Sorted_Grouping_List
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import KMeans

plt.style.use('ggplot')

df = pd.read_table(sys.argv[1], sep="\t", header=0, index_col=0).loc[:, ("CFU", "poly")]
array = df.values
col_names = df.columns.values.tolist()

# Ward hierarchical clustering; Z is the linkage matrix.
Z = linkage(array, 'ward')

# Note: k-means is fit on the linkage matrix Z itself here, not on the raw
# feature array.
kmeans = KMeans(n_clusters=4)
kmeans.fit(Z)
y_means = kmeans.predict(Z)

fig, ax = plt.subplots()
plt.scatter(Z[:, 0], Z[:, 1], c=y_means, s=50, cmap="viridis")
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, alpha=0.5)
fig.savefig("kmeans.png")
plt.close(fig)
# Usage: python3 cluster-features-alberto.py seed file n_clusters
print('-' * 80)
print("Benchmarking with several k values: ")
rang = 10
for k in range(max(2, num_clusters - rang),
               min(len(instance_names) - 1, num_clusters + rang)):
    bench_k_means(KMeans(init='k-means++', n_clusters=k, n_init=10),
                  name="k-means++ (k=" + str(k) + ")",
                  data=features_data)
print('-' * 80)

# Prepare the data for visualization in a 2-D plot.
# (The original passed random_state=np.random.randint(1), which always
# evaluates to 0, so the seed is written as 0 here.)
reduced_data = PCA(n_components=2, random_state=0).fit_transform(features_data)
kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .005

# Plot the decision boundary. For that, we assign a color to each point in
# the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using the trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot.
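# bench_k_means is called above but not defined in this excerpt. A minimal
# stand-in (an assumption, loosely modeled on scikit-learn's clustering
# benchmark example) matching the call signature used here:
from time import time

def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('%-25s\ttime: %.2fs\tinertia: %.0f'
          % (name, time() - t0, estimator.inertia_))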