import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn import cluster
from sklearn.datasets import make_blobs


def KMeans_std():
    # Measure how k-means homogeneity degrades as the blobs get noisier.
    lst = []
    space = np.linspace(0.01, 10, 10)
    samples = 1000
    centers = 4
    dim = 2
    for i in space:
        x, y = make_blobs(n_samples=samples, centers=centers, n_features=dim,
                          random_state=1, cluster_std=i)
        _y = cluster.KMeans(n_clusters=4).fit_predict(x)
        acc = sklearn.metrics.homogeneity_score(y, _y)
        lst.append(acc)
    plt.plot(space, np.array(lst), 'r', label='homogeneity')
    plt.xlabel('Standard deviation')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Samples: {0} Centers: {1} Dimensions: {2}'.format(samples, centers, dim))
    plt.grid(color='#dddddd', linestyle='-', linewidth=1)
    plt.show()
def split_words_bykeam(texteg, wn):
    # Split a binary text image into wn word masks by k-means clustering
    # the (x, y) coordinates of the foreground pixels.
    ret = np.where(texteg)
    pt = np.column_stack((ret[1], ret[0]))
    ap = cluster.KMeans(n_clusters=wn).fit(pt)
    words = [[]] * wn
    for k in range(wn):
        labelpt = pt[np.where(ap.labels_ == k)[0]]
        tmp = np.zeros(texteg.shape)
        tmp[labelpt[:, 1], labelpt[:, 0]] = 1
        words[k] = tmp > 0
    return words
def KMeans_image(img):
    # Quantise an RGB image to 5 colours: cluster the pixels with k-means
    # and replace each pixel by a fixed colour per cluster label.
    s = img.shape
    img = img.reshape((img.shape[0] * img.shape[1], 3))
    std = np.std(img)
    y = cluster.KMeans(n_clusters=5).fit_predict(img)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255)]
    for i, v in enumerate(y):
        img[i] = colors[v]
    img = img.reshape(s)
    return img
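# Minimal usage sketch for KMeans_image (an addition, not part of the original
# snippet): feed it an H x W x 3 uint8 array and display the quantised result.
import numpy as np
import matplotlib.pyplot as plt

demo = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)  # stand-in RGB image
plt.imshow(KMeans_image(demo.copy()))  # pass a copy: the function writes into its argument
plt.show()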
# EM fit loop of an (older) scikit-learn GMM: initialise the means with k-means,
# then iterate E/M steps until the change in log-likelihood falls below thresh.
def fit(self, X, n_iter=10, min_covar=1e-3, thresh=1e-2, params='wmc',
        init_params='wmc'):
    X = np.asanyarray(X)
    if hasattr(self, 'n_features') and self.n_features != X.shape[1]:
        raise ValueError('Unexpected number of dimensions, got %s but '
                         'expected %s' % (X.shape[1], self.n_features))
    self.n_features = X.shape[1]

    if 'm' in init_params:
        # Means initialised from the k-means cluster centres.
        self._means = cluster.KMeans(
            k=self._n_states).fit(X).cluster_centers_
    elif not hasattr(self, 'means'):
        self._means = np.zeros((self.n_states, self.n_features))

    if 'w' in init_params or not hasattr(self, 'weights'):
        self.weights = np.tile(1.0 / self._n_states, self._n_states)

    if 'c' in init_params:
        cv = np.cov(X.T)
        if not cv.shape:
            cv.shape = (1, 1)
        self._covars = _distribute_covar_matrix_to_match_cvtype(
            cv, self._cvtype, self._n_states)
    elif not hasattr(self, 'covars'):
        self.covars = _distribute_covar_matrix_to_match_cvtype(
            np.eye(self.n_features), self.cvtype, self.n_states)

    logprob = []
    for i in xrange(n_iter):
        # E-step: evaluate log-likelihood and posteriors, stop when converged.
        curr_logprob, posteriors = self.eval(X)
        logprob.append(curr_logprob.sum())
        if i > 0 and abs(logprob[-1] - logprob[-2]) < thresh:
            break
        # M-step: update weights, means and covariances.
        self._do_mstep(X, posteriors, params, min_covar)
    return self
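# The method above is an excerpt from an old scikit-learn GMM. In current
# scikit-learn the same k-means-initialised EM fit is available through
# sklearn.mixture.GaussianMixture; a minimal sketch, assuming iris-style data:
from sklearn.datasets import load_iris
from sklearn.mixture import GaussianMixture

X = load_iris().data
gmm = GaussianMixture(n_components=3, covariance_type='full',
                      init_params='kmeans', max_iter=10, tol=1e-2)
gmm.fit(X)              # component means start from k-means cluster centres
print(gmm.means_)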
In [82]: from sklearn import cluster, datasets

In [83]: iris = datasets.load_iris()

In [84]: k_means = cluster.KMeans(k=3)

In [85]: k_means.fit(iris.data)
Out[85]:
KMeans(copy_x=True, init='k-means++', k=3, max_iter=300, n_init=10, n_jobs=1,
    precompute_distances=True,
    random_state=<mtrand.RandomState object at 0x7f4d860642d0>, tol=0.0001,
    verbose=0)

In [86]: print k_means.labels_[::10]
[1 1 1 1 1 2 2 2 2 2 0 0 0 0 0]

In [87]: print iris.target[::10]
[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]
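# The labels above agree with iris.target only up to a relabelling of the
# clusters, so a permutation-invariant score is the fairer comparison.
# A small sketch, continuing the session above:
from sklearn import metrics

print(metrics.adjusted_rand_score(iris.target, k_means.labels_))
print(metrics.homogeneity_score(iris.target, k_means.labels_))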
import random

import numpy as np

# cluster_points() and reevaluate_centers() are defined earlier in the
# referenced datasciencelab post.
def has_converged(mu, oldmu):
    return set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu])

def find_centers(X, K):
    # Initialize to K random centers
    oldmu = random.sample(X, K)
    mu = random.sample(X, K)
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)
    return (mu, clusters)

def init_board(N):
    X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(N)])
    return X

def init_board_gauss(N, k):
    n = float(N) / k
    X = []
    for i in range(k):
        c = (random.uniform(-1, 1), random.uniform(-1, 1))
        s = random.uniform(0.05, 0.5)
        x = []
        while len(x) < n:
            a, b = np.array([np.random.normal(c[0], s), np.random.normal(c[1], s)])
            # Continue drawing points from the distribution in the range [-1,1]
            if abs(a) < 1 and abs(b) < 1:
                x.append([a, b])
        X.extend(x)
    X = np.array(X)[:N]
    return X

###################################
# https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/
def Wk(mu, clusters):
    K = len(mu)
    return sum([np.linalg.norm(mu[i] - c) ** 2 / (2 * len(c))
                for i in range(K) for c in clusters[i]])

def bounding_box(X):
    xmin, xmax = min(X, key=lambda a: a[0])[0], max(X, key=lambda a: a[0])[0]
    ymin, ymax = min(X, key=lambda a: a[1])[1], max(X, key=lambda a: a[1])[1]
    return (xmin, xmax), (ymin, ymax)

def gap_statistic(X):
    (xmin, xmax), (ymin, ymax) = bounding_box(X)
    # Dispersion for real distribution
    ks = range(1, 10)
    Wks = np.zeros(len(ks))
    Wkbs = np.zeros(len(ks))
    sk = np.zeros(len(ks))
    for indk, k in enumerate(ks):
        mu, clusters = find_centers(X, k)
        Wks[indk] = np.log(Wk(mu, clusters))
        # Create B reference datasets
        B = 10
        BWkbs = np.zeros(B)
        for i in range(B):
            Xb = []
            for n in range(len(X)):
                Xb.append([random.uniform(xmin, xmax), random.uniform(ymin, ymax)])
            Xb = np.array(Xb)
            mu, clusters = find_centers(Xb, k)
            BWkbs[i] = np.log(Wk(mu, clusters))
        Wkbs[indk] = sum(BWkbs) / B
        sk[indk] = np.sqrt(sum((BWkbs - Wkbs[indk]) ** 2) / B)
    sk = sk * np.sqrt(1 + 1 / B)
    return (ks, Wks, Wkbs, sk)

X = init_board_gauss(200, 3)
ks, logWks, logWkbs, sk = gap_statistic(X)

# http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
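# How to read the arrays returned above (an added sketch, not part of the
# original post): the gap for each k is the mean reference dispersion minus the
# data dispersion, and Tibshirani's rule picks the smallest k whose gap is
# within one standard error of the gap at k+1.
gaps = np.array(logWkbs) - np.array(logWks)
candidates = [k for i, k in enumerate(ks[:-1]) if gaps[i] >= gaps[i + 1] - sk[i + 1]]
best_k = candidates[0] if candidates else ks[-1]
print('estimated k =', best_k)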
######################################################################################################################
--------------------------------------
# http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
from sklearn import cluster
from scipy.spatial import distance
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

def compute_bic(kmeans, X):
    # Computes the BIC metric for a given clustering.
    # Parameters:
    #   kmeans : fitted KMeans clustering object from scikit-learn
    #   X      : multidimensional np array of data points
    # Returns: BIC value
    #####################
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    # size of data set
    N, d = X.shape
    # compute variance for all clusters beforehand
    cl_var = (1.0 / (N - m) / d) * sum(
        [sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean') ** 2)
         for i in range(m)])
    const_term = 0.5 * m * np.log(N) * (d + 1)
    BIC = np.sum([n[i] * np.log(n[i]) -
                  n[i] * np.log(N) -
                  ((n[i] * d) / 2) * np.log(2 * np.pi * cl_var) -
                  ((n[i] - 1) * d / 2) for i in range(m)]) - const_term
    return BIC

# IRIS DATA
iris = sklearn.datasets.load_iris()
X = iris.data[:, :4]  # extract only the features
# Xs = StandardScaler().fit_transform(X)
Y = iris.target

ks = range(1, 10)
# run kmeans 9 times and save each fitted estimator
KMeans = [cluster.KMeans(n_clusters=i, init="k-means++").fit(X) for i in ks]
# now run the BIC computation for each clustering
# a=df1.as_matrix()
BIC = [compute_bic(kmeansi, X) for kmeansi in KMeans]
print(BIC)
# [-901.8088330799194, -562.67814893720902, -442.4179569307467, -401.31661808222532,
#  -373.70396994638168, -367.27568113462917, -369.13543294596866, -351.7636856213748,
#  -360.97885983416268]

plt.plot(ks, BIC, 'r-o')
plt.title("iris data (cluster vs BIC)")
plt.xlabel("# clusters")
plt.ylabel("BIC")
plt.show()
#######################################################################################################################################
--------------------------------
# https://www.linkedin.com/pulse/finding-k-k-means-clustering-jaganadh-gopinadhan
import pylab as plt
import numpy as np
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
k = range(1, 11)
clusters = [cluster.KMeans(n_clusters=c, init='k-means++').fit(iris.data) for c in k]
centr_lst = [cc.cluster_centers_ for cc in clusters]
k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
distances = [np.min(kd, axis=1) for kd in k_distance]
avg_within = [np.sum(dist) / iris.data.shape[0] for dist in distances]
with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
to_sum_square = np.sum(pdist(iris.data) ** 2) / iris.data.shape[0]
bet_sum_square = to_sum_square - with_in_sum_square
kidx = 2

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k, avg_within, 'g*-')
ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering (IRIS Data)')
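# A companion view (an added sketch, not part of the snippet above): the same
# arrays give the percentage of total variance explained by the clustering,
# which rises steeply up to the elbow and flattens afterwards.
fig2 = plt.figure()
ax2 = fig2.add_subplot(111)
pct_explained = bet_sum_square / to_sum_square * 100
ax2.plot(k, pct_explained, 'b*-')
ax2.plot(k[kidx], pct_explained[kidx], marker='o', markersize=12,
         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained')
plt.title('Elbow for KMeans clustering (IRIS Data)')
plt.show()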
########################################################################################################################################
---------------------------------------------
# http://stanford.edu/~cpiech/cs221/handouts/kmeans.html
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):
    # Initialize centroids randomly
    numFeatures = dataSet.getNumFeatures()
    centroids = getRandomCentroids(numFeatures, k)

    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1

        # Assign labels to each datapoint based on centroids
        labels = getLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, labels, k)

    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids


# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > MAX_ITERATIONS:
        return True
    return oldCentroids == centroids


# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset.
def getLabels(dataSet, centroids):
    # For each element in the dataset, choose the closest centroid.
    # Make that centroid the element's label.
    pass


# Function: Get Centroids
# -------------
# Returns k centroids, each of dimension n.
def getCentroids(dataSet, labels, k):
    # Each centroid is the geometric mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points have
    # that centroid's label) you should randomly re-initialize it.
    pass
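# One possible NumPy completion of the helpers the handout leaves as exercises
# (a sketch under the assumption that dataSet is a plain (n_samples, n_features)
# array, so dataSet.getNumFeatures() becomes dataSet.shape[1]; MAX_ITERATIONS is
# chosen here, not taken from the handout).
import numpy as np

MAX_ITERATIONS = 100

def getRandomCentroids(numFeatures, k):
    # k random points in the unit cube; any sensible initialisation works here.
    return np.random.rand(k, numFeatures)

def getLabels(dataSet, centroids):
    # Index of the closest centroid for every point (squared Euclidean distance).
    dists = ((dataSet[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    return np.argmin(dists, axis=1)

def getCentroids(dataSet, labels, k):
    # Mean of the points carrying each label; an empty cluster is re-initialised randomly.
    centroids = np.empty((k, dataSet.shape[1]))
    for j in range(k):
        members = dataSet[labels == j]
        centroids[j] = members.mean(axis=0) if len(members) else np.random.rand(dataSet.shape[1])
    return centroids

# Note: with array centroids, the handout's `oldCentroids == centroids` test in
# shouldStop would need np.array_equal(oldCentroids, centroids) instead.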
#######################################################################################################################
#######################################################################################################################
## https://gist.github.com/jaganadhg/ddbf0956a7921b83ceef90b8a81dfaee
"""
Author : Jaganadh Gopinadhan
Licence : Apache 2
e-mail jaganadhg at gmail dot com
"""
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris


class TWHGapStat(object):
    """
    Implementation of the Gap Statistic from Tibshirani, Walther and Hastie to
    determine the inherent number of clusters in a dataset with k-means clustering.
    Ref Paper : https://web.stanford.edu/~hastie/Papers/gap.pdf
    """

    def generate_random_data(self, X):
        """
        Populate reference data.

        Parameters
        ----------
        X : numpy array
            The base data from which the random sample has to be generated.

        Returns
        -------
        reference : numpy array
            Uniform reference data generated with the NumPy random utility.
            The number of dimensions is the same as in the base dataset.
        """
        reference = np.random.random_sample(size=(X.shape[0], X.shape[1]))
        return reference

    def _fit_cluster(self, X, n_cluster, n_iter=5):
        """
        Fit clusters on reference data and return the mean inertia.

        Parameters
        ----------
        X : numpy array
            The base data.
        n_cluster : int
            The number of clusters to form.
        n_iter : int, default = 5
            Number of repeated clustering experiments to perform on the data.
            If the data is large keep it below 5, so that the run time stays low.

        Returns
        -------
        mean_inertia : float
            The mean inertia value.
        """
        iterations = range(1, n_iter + 1)
        ref_inertias = pd.Series(index=iterations, dtype=float)
        for iteration in iterations:
            clusterer = KMeans(n_clusters=n_cluster, n_init=3, n_jobs=-1)
            # If you are using a Windows server, n_jobs = -1 can be dangerous; set the
            # value to max cores - 3. If we use all the cores available on a Windows
            # server, sklearn tends to throw a memory error.
            clusterer.fit(X)
            ref_inertias[iteration] = clusterer.inertia_
        mean_inertia = ref_inertias.mean()
        return mean_inertia

    def fit(self, X, max_k):
        """
        Compute Gap Statistics.

        Parameters
        ----------
        X : numpy array
            The base data.
        max_k : int
            Maximum value to which we are going to test the 'k' in the
            k-means algorithm.

        Returns
        -------
        gap_stat : pandas Series
            For each k in the max_k range the gap statistic value is returned
            as a pandas Series; the index is k and the values are the gap
            statistics for each k.
        """
        k_range = range(1, max_k + 1)
        gap_stat = pd.Series(index=k_range, dtype=float)
        ref_data = self.generate_random_data(X)
        for k in k_range:
            base_clusterer = KMeans(n_clusters=k, n_init=3, n_jobs=-1)
            base_clusterer.fit(X)
            ref_inertia = self._fit_cluster(ref_data, k)
            # Gap(k) = log(reference inertia) - log(data inertia), per the paper.
            cur_gap = np.log(ref_inertia) - np.log(base_clusterer.inertia_)
            gap_stat[k] = cur_gap
        return gap_stat


if __name__ == "__main__":
    iris = load_iris()
    X = iris.data
    gap_stat = TWHGapStat()
    gs = gap_stat.fit(X, 5)
    print(gs)
import pandas as pd
import matplotlib.pyplot as plt
import cluster as clt  # custom k-means module, not sklearn.cluster
import timeit

start = timeit.default_timer()

dataset = pd.read_csv('/home/neo/Desktop/kmeans/dataset.csv')
dataset = dataset.values

# Elbow curve: within-cluster sum of squares for k = 1..9
wcss = []
for i in range(1, 10):
    kmeans = clt.KMeans(n_clusters=i, shift_tolerance=0.02, thread_capacity=4)
    kmeans.fit(dataset)
    wcss.append(kmeans.inertia)
plt.plot(range(1, 10), wcss)
plt.show()

kmeans = clt.KMeans(n_clusters=2, shift_tolerance=0.005, thread_capacity=4)
kmeans.fit_showDetails(dataset)
plt.scatter([x[0] for x in kmeans.cluster[0]], [x[1] for x in kmeans.cluster[0]],
            s=2, color='blue')
plt.scatter([x[0] for x in kmeans.cluster[1]], [x[1] for x in kmeans.cluster[1]],
            s=2, color='red')
# Assumed completion of the truncated third call: it only applies when a third
# cluster exists (n_clusters >= 3), and the colour is a guess.
plt.scatter([x[0] for x in kmeans.cluster[2]], [x[1] for x in kmeans.cluster[2]],
            s=2, color='green')
import numpy as np
from skimage import measure
from sklearn import cluster

# getMaxRect() and is_maybe_inword() are helpers defined elsewhere in the same project.
def sort_split_word(texteg, wn):
    # Split a binary text image into wn word masks: order connected components by
    # area, merge fragments into existing words, and fall back to k-means on the
    # pixel coordinates when the components form one connected whole.
    total_area = texteg.sum() * 1.0
    labels = measure.label(texteg, neighbors=8)
    rgps = measure.regionprops(labels)
    sort_lab = np.zeros((len(rgps), 2))
    for i in range(len(rgps)):
        lab = rgps[i]
        rt = lab.area / total_area
        sort_lab[i] = [rt, lab.label]
    label_sort_area = np.argsort(-sort_lab[:, 0])
    words = []
    bwhole = 0
    for i in range(len(label_sort_area)):
        ilabel = int(sort_lab[label_sort_area[i]][1])
        hword = (labels == ilabel)
        if i == 0:
            word = labels == ilabel
            rect = getMaxRect(word)
            words.extend([word])
            if rect[3] - rect[2] > 18:
                # whole word, don't break
                bwhole = 1
                print('whole body')
                # break
            if rect[3] - rect[2] > 30 and sort_lab[label_sort_area[i]][0] > 0.85:
                # complete whole body
                bwhole = 2
            continue
        match = []
        for k in range(len(words)):
            tpwd = words[k]
            binword, dis = is_maybe_inword(ilabel, labels, tpwd, bwhole)
            if binword == 0 and len(words) < wn and bwhole < 2:
                word = labels == ilabel
                words.extend([word])
                break
            elif binword == 1 and dis == 0:
                words[k] = tpwd + (labels == ilabel)
                break
            elif binword == 1 and dis > 0:
                match.extend([[dis, k]])
        match = np.array(match)
        if len(match) == 0:
            continue
        if len(match) == 1:
            mk = int(match[0][1])
            words[mk] = words[mk] + (labels == ilabel)
        else:
            mk = np.argsort(match[:, 0])[0]
            words[mk] = words[mk] + (labels == ilabel)
    if bwhole > 0:
        # The components form (part of) one connected body: re-split the merged mask
        # into wn words by k-means on the foreground pixel coordinates.
        if len(words) == 1:
            wholeword = words[0]
        else:
            wholeword = words[0] + words[1]
        ret = np.where(wholeword)
        pt = np.column_stack((ret[1], ret[0]))
        ap = cluster.KMeans(n_clusters=wn).fit(pt)
        words = [[]] * wn
        for k in range(wn):
            labelpt = pt[np.where(ap.labels_ == k)[0]]
            tmp = np.zeros(wholeword.shape)
            tmp[labelpt[:, 1], labelpt[:, 0]] = 1
            words[k] = tmp > 0
    return words