def chooseK(data_X, data_Y):
    """Try k = 1..99 and return (best_rand_index, best_k).

    For each k, fit KMeansClassifier on data_X and score the resulting
    labeling against the reference labels data_Y with the Rand index.
    """
    best_rd, best_k = 0, 0
    n = len(data_Y)
    for k in range(1, 100):
        clf = KMeansClassifier(k)
        clf.fit(data_X)
        labels = clf._labels
        # Pair-counting over every unordered pair (i, j) with i < j:
        # a = same class & same cluster, b = same class & different cluster,
        # c = different class & same cluster, d = different in both.
        a = b = c = d = 0
        for j in range(1, n):
            for i in range(j):
                same_truth = data_Y[i] == data_Y[j]
                same_cluster = labels[i] == labels[j]
                if same_truth and same_cluster:
                    a += 1
                elif same_truth:
                    b += 1
                elif same_cluster:
                    c += 1
                else:
                    d += 1
        # Rand index: (a + d) / C(n, 2) == 2 * (a + d) / (n * (n - 1)).
        rd = 2 * (a + d) / (n * (n - 1))
        print("rd ", rd, "k ", k)
        if rd > best_rd:
            best_rd, best_k = rd, k
    return best_rd, best_k
def kmeans_classification_builder(centroid_func, x_train, x_test, y_train, y_test):
    """Fit a 10-cluster KMeansClassifier on digit data with the supplied
    centroid initializer, print its test accuracy, and save a preview grid
    of the first 25 training digits to plots/digits.png."""
    # Tile the first N training digits (8x8 each) onto an l-by-l grid image.
    N = 25
    l = int(np.ceil(np.sqrt(N)))
    im = np.zeros((10 * l, 10 * l))
    for idx in range(N):
        row, col = divmod(idx, l)
        im[10 * row:10 * row + 8, 10 * col:10 * col + 8] = \
            x_train[idx].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 10
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train, centroid_func)
    y_hat_test = classifier.predict(x_test)

    assert y_hat_test.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'
    print(
        '[*] Prediction accuracy of K-means classifier with {} cluster is {}'.
        format(n_cluster, np.mean(y_hat_test == y_test)))
def getkmeansresult(k, data_X, data_od):
    """Run k-means with k clusters on data_X and validate the result.

    Returns (flag, clf). flag is False when any cluster contains a point
    farther than 3000 from its centroid, or accumulates more than 4000
    total weight (column 0 of data_od); otherwise True.
    """
    clf = KMeansClassifier(k)
    clf.fit(data_X, data_od)
    cents = clf._centroids
    labels = clf._labels
    flag = True
    for i in range(k):
        # Samples currently assigned to cluster i.
        index = np.nonzero(labels == i)[0]
        x0 = data_X[index, 0]
        x1 = data_X[index, 1]
        tmpsum = 0
        for j in range(len(x0)):
            # BUG FIX: np.math.sqrt relied on the np.math alias removed in
            # NumPy 1.25; np.hypot computes the same Euclidean distance.
            tmpl = np.hypot(cents[i, 0] - x0[j], cents[i, 1] - x1[j])
            tmpsum += data_od[index, 0][j]
            if tmpl > 3000:
                flag = False
                break
        if tmpsum > 4000:
            flag = False
        if flag == False:
            break
    return flag, clf
def kmeans_classification():
    """Compare a K-means classifier with logistic regression and KNN on the
    digits dataset; save a digit preview image, print accuracies, and store
    the k-means results in results/k_means_classification.npz."""
    x_train, x_test, y_train, y_test = load_digits()

    # Tile the first N (8x8) training digits into one preview image.
    N = 25
    l = int(np.ceil(np.sqrt(N)))
    im = np.zeros((10 * l, 10 * l))
    for m in range(l):
        for n in range(l):
            if (m * l + n < N):
                im[10 * m:10 * m + 8, 10 * n:10 * n + 8] = \
                    x_train[m * l + n].reshape([8, 8])
    plt.imsave('plots/digits.png', im, cmap='Greys')

    n_cluster = 30
    classifier = KMeansClassifier(n_cluster=n_cluster, max_iter=100, e=1e-6)
    classifier.fit(x_train, y_train)
    y_hat_kmeans = classifier.predict(x_test)

    assert y_hat_kmeans.shape == y_test.shape, \
        'y_hat_test and y_test should have same shape'

    print('Prediction accuracy of K-means classifier with {} cluster is {}'.
          format(n_cluster, np.mean(y_hat_kmeans == y_test)))

    linear_classifier = LogisticRegression()
    linear_classifier.fit(x_train, y_train)
    y_hat_lr = linear_classifier.predict(x_test)
    print('Accuracy of logistic regression classifier is {}'.format(
        np.mean(y_hat_lr == y_test)))

    KNNClassifier = KNeighborsClassifier()
    KNNClassifier.fit(x_train, y_train)
    y_hat_knn = KNNClassifier.predict(x_test)
    print('Accuracy of Nearest Neighbour classifier is {}'.format(
        np.mean(y_hat_knn == y_test)))

    # BUG FIX: the original reused one y_hat_test variable for all three
    # models, so the "k_means" archive actually stored the KNN predictions.
    # Save the k-means predictions, as the file name promises.
    np.savez('results/k_means_classification.npz',
             y_hat_test=y_hat_kmeans,
             y_test=y_test,
             centroids=classifier.centroids,
             centroid_labels=classifier.centroid_labels)
def fit(self, X):
    """Bisecting k-means: start with one cluster (the global mean) and
    repeatedly 2-way split the cluster whose split lowers total SSE the
    most, until self._k clusters exist.

    Sets self._labels, self._sse, self._centroids and self._clusterAssment
    (column 0: assigned cluster index, column 1: squared error).
    """
    m = X.shape[0]
    self._clusterAssment = np.zeros((m, 2))
    centroid0 = np.mean(X, axis=0).tolist()
    cenList = [centroid0]
    # Initial squared error of every sample w.r.t. the global centroid.
    for j in range(m):
        self._clusterAssment[j, 1] = self._calEDist(np.asarray(centroid0),
                                                    X[j, :])**2
    while (len(cenList) < self._k):
        lowestSSE = np.inf
        for i in range(len(cenList)):
            # Collect the samples currently assigned to cluster i.
            index_all = self._clusterAssment[:, 0]
            value = np.nonzero(index_all == i)
            pstInCurrCluster = X[value[0], :]
            # Trial 2-way split of cluster i.
            clf = KMeansClassifier(k=2)
            clf.fit(pstInCurrCluster)
            centroidMat, splitClustAss = clf._centroids, clf._clusterAssment
            sseSplit = sum(splitClustAss[:, 1])
            index_all = self._clusterAssment[:, 0]
            value = np.nonzero(index_all == i)
            sseNotSplit = sum(self._clusterAssment[value[0], 1])
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # After the split one sub-cluster keeps the original cluster index
        # and the other receives the fresh index len(cenList).
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0], 0] = len(cenList)
        bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0],
                     0] = bestCentToSplit
        cenList[bestCentToSplit] = bestNewCents[0, :].tolist()
        # BUG FIX: the original called .tolist() on list.append's return
        # value (None), raising AttributeError on the first split; convert
        # the centroid row to a list before appending it.
        cenList.append(bestNewCents[1, :].tolist())
        self._clusterAssment[np.nonzero(
            self._clusterAssment[:, 0] == bestCentToSplit)[0], :] = bestClustAss
    self._labels = self._clusterAssment[:, 0]
    self._sse = sum(self._clusterAssment[:, 1])
    self._centroids = np.asarray(cenList)
def get_cluster(starttime, stoptime, k):
    """Run k-means on station coordinates for the given period.

    Returns (df_origin, centroids): the original dataframe extended with a
    'label_kmeans' column, and the fitted centroid array. Also shows a
    scatter plot of the stations colored by cluster, centroids in red.
    """
    df = get_pandas_df(starttime, stoptime)
    df_origin = df[1]
    df = df[0]
    # BUG FIX: the np.float alias was removed in NumPy 1.24; use the
    # builtin float, which np.float was an alias for.
    data_X = np.array(df).astype(float)
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    centroids = clf._centroids
    labels = clf._labels
    df_origin["label_kmeans"] = labels.astype('int')
    plt.scatter(df['lon'], df['lat'], c=labels, s=50, alpha=0.5)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.show()
    return df_origin, centroids
@author: liudiwei """ import pandas as pd import numpy as np from kmeans import KMeansClassifier import matplotlib.pyplot as plt #加载数据集,DataFrame格式,最后将返回为一个matrix格式 def loadDataset(infile): df = pd.read_csv(infile, sep='\t', header=0, dtype=str, na_filter=False) return np.array(df).astype(np.float) if __name__=="__main__": data_X = loadDataset(r"data/testSet.txt") k = 3 clf = KMeansClassifier(k) clf.fit(data_X) cents = clf._centroids labels = clf._labels sse = clf._sse colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868'] for i in range(k): index = np.nonzero(labels==i)[0] x0 = data_X[index, 0] x1 = data_X[index, 1] y_i = i for j in range(len(x0)): plt.text(x0[j], x1[j], str(y_i), color=colors[i], \ fontdict={'weight': 'bold', 'size': 6}) plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],\ linewidths=7)
import os from kmeans import KMeansClassifier def load_data(path): df = pd.read_csv(path, sep="\t", header=0, dtype=str, na_filter=False) return np.array(df).astype(np.float) if __name__ == "__main__": project_dir = os.path.dirname(os.path.realpath(__file__)) data = load_data(os.path.join(project_dir, 'data', 'test.txt')) k = 3 classifier = KMeansClassifier(k) classifier.fit(data) centers = classifier._centroids labels = classifier._labels sse = classifier._sse print(labels) print(sse) colors = [ 'b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868' ] for i in range(k): index = np.nonzero(labels == i)[0] x = data[index, 0] y = data[index, 1] for j in range(len(x)): plt.text(x[j],
# Script entry: cluster the Iris training features with k-means and compare
# the resulting cluster labels against the true class labels.
if __name__=="__main__":
    #data_X = loadDataset(r"data/testSet.txt")
    #data_X,label_X = readUCIdata()
    #data_X,labelreallist,labellist,drawcolorlist,datalistx,datalisty = readUCIdata1('perfume_data.xlsx')
    #print(data_X[0][1])
    trainingdata,testdata = readUCIIris()
    trainingfeature,traininglabel = splitlabanddata(trainingdata)
    data_X = np.array(trainingfeature)
    #k = 4
    # Three clusters -- presumably chosen to match Iris's 3 classes.
    k=3
    print(data_X)
    #print(data_X[0])
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    # Internal attributes exposed by KMeansClassifier after fitting.
    cents = clf._centroids
    labels = clf._labels
    sse = clf._sse
    colors = ['red','purple','darkgreen','darkgray','darksalmon','darkred','olive','yellow','yellowgreen',
        'silver','cyan','pink','orangered','orange','navy','magenta','lightgoldenrodyellow',
        'lavenderblush','honeydew','mediumseagreen']
    print(cents)
    pred = clf.predict(data_X)
    print(pred)
    print("The labels is:",labels)
    print(traininglabel)
    # NOTE(review): presumably filled per-sample further down -- the rest of
    # the script is outside this view.
    colorlist = []
class TSP(object):
    """Greedy nearest-neighbour TSP heuristic.

    Cities are clustered with k-means (30 clusters) so that each search
    for the nearest unvisited city only scans the cities of the 50 closest
    clusters instead of every city.
    """

    def __init__(self):
        self.num_cities = None
        self.cities = None            # (N, 3) float array of [id, x, y] rows
        self.kmeans = KMeansClassifier(k=30)
        self.cid_to_cities = dict()   # cluster id -> list of city indices
        self.visited_col = None       # (N, 1) visited flags

    def getdist(self, c1, c2):
        """Return the Euclidean distance between two [id, x, y] city rows."""
        squared_distance = 0.0
        x1 = c1[1]; y1 = c1[2]
        x2 = c2[1]; y2 = c2[2]
        squared_distance += (x2 - x1) ** 2
        squared_distance += (y2 - y1) ** 2
        return math.sqrt(squared_distance)

    def _readdata(self, fname):
        """Parse space-separated 'id x y' rows into self.cities.

        Ids are normalized to start at 0: an offset of -1 is applied when
        the first id in the file is non-zero.
        """
        citydata = []
        idx_offset = 0
        # BUG FIX: csv files must be opened in text mode (newline='') under
        # Python 3; the original used the Python-2-only 'rb' mode.
        with open(fname, newline='') as f1:
            reader = csv.reader(f1, delimiter=' ')
            for line in reader:
                if int(line[0]) == 0:
                    idx_offset = 0
                else:
                    idx_offset = -1
                break  # only the first row is needed to detect the base index
        with open(fname, newline='') as f:
            reader = csv.reader(f, delimiter=' ')
            for line in reader:
                cid = int(line[0]) + idx_offset  # renamed from 'id' (builtin)
                x = float(line[1])
                y = float(line[2])
                citydata.append([cid, x, y])
        self.cities = np.array([[v for v in city] for city in citydata])
        self.cities[:, 0] = self.cities[:, 0].astype(int)

    def _build_cid_to_cities(self, cid):
        """Group city indices by their assigned cluster label."""
        for (i, cl) in enumerate(cid):
            c = int(cl[0])
            if c in self.cid_to_cities:
                self.cid_to_cities[c].append(i)
            else:
                self.cid_to_cities[c] = [i]

    def prepare_data(self, fname):
        """Read the city file, cluster the coordinates, reset visit state."""
        self._readdata(fname)
        # BUG FIX: Python 2 print statements converted to print() calls,
        # consistent with the rest of the codebase.
        print(' Clustering.')
        cids, _ = self.kmeans.train(self.cities[:, 1:], epochs=5)
        self.visited_col = np.zeros((len(self.cities), 1))
        self._build_cid_to_cities(cids)
        print(' Clustering DONE')

    def closest_city(self, c):
        """Return the index of the nearest unvisited city to city c,
        searching only the 50 clusters closest to c (ties -> lowest index)."""
        inputcity = self.cities[c]
        cids = self.kmeans.get_closest_clusters(inputcity[1:3], num_clusters=50)
        clustercities = [self.cid_to_cities[i[0]] for i in cids]
        cdist = 99999999
        outputcity = c
        for city in sum(clustercities, []):
            dist = self.getdist(self.cities[city], inputcity)
            if dist == 0:
                continue
            if self.visited_col[city] == True:
                continue
            if dist < cdist:
                cdist = dist
                outputcity = city
            elif dist == cdist:
                # On a distance tie keep the smallest city index.
                if city < outputcity:
                    outputcity = city
        return outputcity

    def traverse(self, c):
        """Recursively hop to the nearest unvisited city, accumulating the
        distance travelled; records the tour's final city in the
        module-level 'lastcity'."""
        global lastcity
        if c is None:
            return
        self.visited_col[c] = True
        cc = self.closest_city(c)
        lastcity = cc
        dist = self.getdist(self.cities[c], self.cities[cc])
        if self.visited_col[cc] == False:
            self.visited_col[cc] = True
            dist = dist + self.traverse(cc)
        return dist

    def tsp(self):
        """Run the greedy tour from city 0 and close the loop back to 0."""
        dist = self.traverse(0)
        dist = dist + self.getdist(self.cities[lastcity], self.cities[0])
        return dist
def __init__(self):
    # Initialize empty solver state; data is loaded later (outside this view).
    # Number of cities -- not set here; presumably filled elsewhere (TODO confirm).
    self.num_cities=None
    # City table, populated after reading the input file.
    self.cities=None
    # K-means helper with 30 clusters, used to localize neighbour searches.
    self.kmeans=KMeansClassifier(k=30)
    # Maps a cluster id to the list of city indices assigned to it.
    self.cid_to_cities=dict()
    # Per-city visited flags, allocated once the city count is known.
    self.visited_col=None