def fit(self, data):
    # 1. Initialize parameters
    self.W = np.ones((len(data), self.n_clusters)) / self.n_clusters
    self.pi = [1 / self.n_clusters] * self.n_clusters
    # Assume each covariance matrix is diagonal with equal variance in every
    # direction (clusters visualize as circles); only the diagonal is stored.
    self.Var = [[1, 1] for _ in range(self.n_clusters)]
    # rand_idx = np.arange(data.shape[0])
    # np.random.shuffle(rand_idx)
    # self.Mu = data[rand_idx[0: self.n_clusters]]
    # Initialize the means with k-means
    my_kmeans = K_Means(n_clusters=self.n_clusters)
    my_kmeans.fit(data)
    self.Mu = my_kmeans.get_centers()
    # Iterative optimization
    iters = 0
    while iters < self.max_iter:
        # E-step: compute and update the posterior probabilities W
        self.W = self.update_W(data, self.pi, self.Mu, self.Var)
        # M-step: update the model parameters via maximum likelihood
        self.pi = self.update_pi(self.W)
        self.Mu = self.update_Mu(data, self.W)
        self.Var = self.update_Var(data, self.W, self.Mu)
        iters += 1
    print("iters =", iters)
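# The E-step and M-step helpers are not shown in this snippet. For orientation,
# here is a minimal sketch of what the update_W method might look like under the
# diagonal-covariance assumption above; the signature is taken from the call
# site, and scipy is a dependency of the sketch, not of the original code.
import numpy as np
from scipy.stats import multivariate_normal

def update_W(self, data, pi, Mu, Var):
    # Responsibility of cluster k for point i:
    #   W[i, k] is proportional to pi[k] * N(x_i | Mu[k], diag(Var[k]))
    pdfs = np.zeros((len(data), self.n_clusters))
    for k in range(self.n_clusters):
        pdfs[:, k] = pi[k] * multivariate_normal.pdf(
            data, mean=Mu[k], cov=np.diag(Var[k]))
    # Normalize rows so each point's responsibilities sum to 1
    return pdfs / pdfs.sum(axis=1, keepdims=True)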
def fit(self, data):
    # Build the affinity graph, its Laplacian, and the spectral embedding Y,
    # then cluster the embedded points with K-Means
    distance_matrix = self.get_DistanceMatrix(data)
    adjacent = self.Distance_to_Weigt_knn(distance_matrix, k=10)
    laplacian_matrix = self.get_LaplacianMatrix(adjacent)
    Y = self.get_YMatrix(laplacian_matrix)
    my_kmeans = K_Means(n_clusters=self.n_clusters)
    my_kmeans.fit(Y)
    self.labels = my_kmeans.predict(Y)
def fit(self, X, eigValueGap=False, bShowGap=False):
    W = self.calculateDistanceMatrix(X)
    Adjacent = self.distTransToWeightKNN(W, k=10)
    Laplacian = self.calculateLaplacianMatrix(Adjacent, normalized='rm')
    Y = self.calculateYMatrix(Laplacian, eigValueGap=eigValueGap, bShowGap=bShowGap)
    # Toggle between the hand-written K-Means and sklearn's KMeans
    USE_MY_KMEANS = True
    if USE_MY_KMEANS:
        my_kmeans = K_Means(self.n_clusters)
        my_kmeans.fit(Y)
        labels = my_kmeans.predict(Y)
    else:
        labels = KMeans(n_clusters=self.n_clusters).fit(Y).labels_
    return labels
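# The Laplacian construction is not shown in this snippet. As a reference,
# a minimal sketch of what calculateLaplacianMatrix(Adjacent, normalized='rm')
# plausibly computes: the random-walk normalized Laplacian L_rw = I - D^{-1} W.
# This is an assumption inferred from the 'rm' flag; the real helper may differ.
import numpy as np

def random_walk_laplacian(Adjacent):
    # Degree of each node: row sums of the affinity matrix
    degrees = np.sum(Adjacent, axis=1)
    # Guard against isolated nodes with zero degree
    Dinv = np.diag(1.0 / np.maximum(degrees, 1e-10))
    return np.eye(Adjacent.shape[0]) - Dinv @ Adjacent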
def predict(self, data):
    # Degree matrix: D[i, i] is the total weight of all edges incident to point i
    self.D_mat = np.diag(np.sum(self.simi_graph, axis=1))
    # Unnormalized graph Laplacian: L = D - W
    self.lap_mat = self.D_mat - self.simi_graph
    # self.lap_mat = np.linalg.inv(self.D_mat) * self.simi_graph
    eigenvalues, eigenvectors = np.linalg.eig(self.lap_mat)
    # Sort by ascending eigenvalue and keep the k smallest eigenvectors
    sort = eigenvalues.argsort()
    eigenvalues = eigenvalues[sort]
    eigenvectors = eigenvectors[:, sort]
    k_eigenvectors = eigenvectors[:, :self.k_]
    print("k eigen shape", np.shape(k_eigenvectors))
    # Cluster the points in the spectral embedding with K-Means
    k_means = K_Means(self.k_)
    k_means.fit(k_eigenvectors)
    cat = k_means.predict(k_eigenvectors)
    return cat
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Initialize all clustering algorithms
# ============
# Hand-written K-Means, GMM, and spectral clustering
my_kmeans = K_Means(n_clusters=params['n_clusters'])
my_gmm = GMM(n_clusters=params['n_clusters'])
my_spec = Spectral()

# Algorithms shipped with sklearn
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'],
                        min_cluster_size=params['min_cluster_size'])
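# Assumed driver loop, following sklearn's standard clustering-comparison demo
# that this setup mirrors: fit each estimator on X and collect its labels for
# plotting. A sketch only; the actual loop is not part of this snippet.
clustering_algorithms = (
    ('MY_KMeans', my_kmeans), ('MY_GMM', my_gmm), ('MY_Spectral', my_spec),
    ('MiniBatchKMeans', two_means), ('MeanShift', ms), ('Ward', ward),
    ('SpectralClustering', spectral), ('DBSCAN', dbscan), ('OPTICS', optics),
)
for name, algorithm in clustering_algorithms:
    algorithm.fit(X)
    # sklearn estimators expose labels_ after fit; the hand-written ones predict
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)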
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Initialize all clustering algorithms
# ============
# Hand-written K-Means (four fit variants) and GMM
my_kmeans0 = K_Means(n_clusters=params['n_clusters'], fit_method=0)
my_kmeans1 = K_Means(n_clusters=params['n_clusters'], fit_method=1)
my_kmeans2 = K_Means(n_clusters=params['n_clusters'], fit_method=2)
my_kmeans3 = K_Means(n_clusters=params['n_clusters'], fit_method=3)
my_gmm = GMM(n_clusters=params['n_clusters'], dim=X.shape[1])
# Spectral clustering variants: k-NN vs. radius graphs, reciprocal vs.
# Gaussian distance weights (the variable names encode the configuration)
my_spectral_knn_reciprocal_normalized = SpectralClustering(
    n_clusters=params['n_clusters'], nnk=50)
my_spectral_radius_reciprocal_normalized = SpectralClustering(
    n_clusters=params['n_clusters'], use_radius_nn=True, nnradius=1)
my_spectral_knn_gauss05_normalized = SpectralClustering(
    n_clusters=params['n_clusters'], nnk=50, use_gauss_dist=True)
my_spectral_knn_gauss005_normalized = SpectralClustering(
    n_clusters=params['n_clusters'], nnk=50, use_gauss_dist=True, gauss_sigma=5e-2)
        theta = random.uniform(-2 * np.pi, 2 * np.pi)
        R = random.uniform(0, r)
        data.append([x + R * np.cos(theta), y + R * np.sin(theta)])
    return data


if __name__ == '__main__':
    GMM = GMM_Cluster()
    data = []
    data += get_data(-10, 15, 5, 50)
    data += get_data(-10, -10, 5, 50)
    data += get_data(10, 12, 5, 50)
    data += get_data(5, -5, 5, 50)
    GMM.train(data, 4)
    KM = K_Means()
    KM.train(data, 4)

    plt.title('GMM Clustering')
    plt.xlim(-20, 20)
    plt.ylim(-20, 20)
    plt.plot([x[0] for x in GMM.Clusters[0]], [x[1] for x in GMM.Clusters[0]], 'o', color='red')
    plt.plot([x[0] for x in GMM.Clusters[1]], [x[1] for x in GMM.Clusters[1]], 'o', color='green')
    plt.plot([x[0] for x in GMM.Clusters[2]], [x[1] for x in GMM.Clusters[2]], 'o', color='blue')
    plt.plot([x[0] for x in GMM.Clusters[3]], [x[1] for x in GMM.Clusters[3]], 'o',
             color='orange')  # fourth cluster; color assumed, the source truncates here
    plt.show()
class Spectral(object):
    def __init__(self):
        self.tolerance_ = 0.00001

    # Alternative affinity: reciprocal of the Euclidean distance
    # def distance(self, euclid_dis):
    #     return 1 / (euclid_dis + 0.000001)

    def distance(self, euclid_dis):
        # Gaussian-style affinity: nearby points get weights close to 1
        return np.exp(-euclid_dis)

    def confirm_k(self, value, sort_idx):
        # Eigenvalue-gap heuristic: walk up the sorted eigenvalues and stop
        # at the first gap larger than the mean of the first few gaps
        sorted_value = value[sort_idx]
        diff = sorted_value[1:] - sorted_value[0:-1]
        mean_diff = np.mean(diff[0:5])
        kk = 1
        for kk in range(1, value.shape[0]):
            if sorted_value[kk] - sorted_value[kk - 1] > mean_diff:
                break
        return kk

    def fit(self, data):
        data_num = data.shape[0]
        W = np.zeros((data_num, data_num), dtype=float)
        D = np.zeros((data_num, data_num), dtype=float)
        Dinv = np.zeros((data_num, data_num), dtype=float)
        # k-NN affinity graph built with a KD-tree
        self.kdtree = KDTree(data)
        for ii in range(data_num):
            euclid_dis, idx = self.kdtree.query(data[ii, :],
                                                k=max(int(data_num / 20), 10))
            W[ii, idx] = self.distance(euclid_dis)
            W[ii, ii] = 0
        # Symmetrize: the geometric mean keeps only mutual-neighbor edges
        W = np.sqrt(W * W.transpose())
        for ii in range(data_num):
            D[ii, ii] = np.sum(W[ii, :])
            # Clamp tiny degrees to avoid division by zero
            Dinv[ii, ii] = 1 / max(D[ii, ii], 0.0001)
        # Random-walk normalized Laplacian: Lrw = I - D^{-1} W
        # Lrw = np.matmul(Dinv, D - W)
        Lrw = np.eye(data_num) - np.matmul(Dinv, W)
        # Lrw = D - W
        value, vector = np.linalg.eig(Lrw)
        sort_idx = np.argsort(value)
        # Estimate the number of clusters from the eigenvalue gap
        k_means_k = self.confirm_k(value, sort_idx)
        print('k is evaluated as {}'.format(k_means_k))
        print('idx:', sort_idx[0:k_means_k],
              'lambda', value[sort_idx[0:k_means_k]])
        # Cluster the rows of the k smallest eigenvectors with K-Means
        k_means_data = vector[:, sort_idx[0:k_means_k]]
        self.k_means_manager = K_Means(k_means_k)
        self.k_means_manager.fit(k_means_data)
        self.spectral_result = np.array(
            self.k_means_manager.predict(k_means_data))

    def predict(self, data):
        # Assign each query point the label of its nearest training point
        ret = []
        for ii in range(data.shape[0]):
            distance, idx = self.kdtree.query(data[ii, :], k=1)
            ret.append(self.spectral_result[idx])
        return ret
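# A minimal usage sketch (assumed): Spectral depends on the hand-written
# K_Means and on scipy.spatial.KDTree being importable, as in the class above.
import numpy as np

rng = np.random.default_rng(0)
# Two well-separated 2-D blobs; confirm_k should find k = 2
X = np.vstack([rng.normal(0.0, 0.5, (50, 2)),
               rng.normal(5.0, 0.5, (50, 2))])

spec = Spectral()
spec.fit(X)                # builds the graph, picks k, clusters the embedding
labels = spec.predict(X)   # nearest-neighbor lookup into the fitted labels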
             [[12, 11], 'C'], [[3, 4], 'B'], [[10, 12], 'C'], [[6, 7], 'A']]
qua3_test_data = [[12, 20], [8, 9], [13, 14], [7, 5], [9, 11]]


def get_data(x, y, r, num):
    # Sample num random points inside a circle of radius r centered at (x, y)
    data = []
    for i in range(num):
        theta = random.uniform(-2 * np.pi, 2 * np.pi)
        R = random.uniform(0, r)
        data.append([x + R * np.cos(theta), y + R * np.sin(theta)])
    return data


model = K_Means()
data = []
data += get_data(-10, 10, 8, 50)
data += get_data(0, -5, 8, 50)
data += get_data(10, 10, 8, 50)
model.train(data, k=3, metric='Euc')

clusters = []
for c in model.C:
    clusters.append(c)
ave = model.Ave
class SpectralCluster(object):
    def __init__(self, n_clusters=2):
        # k is the number of clusters
        self.k_ = n_clusters
        self.kmeans = K_Means(n_clusters=n_clusters)

    def squared_exponential(self, x, y, sig=0.8, sig2=1):
        # Squared-exponential kernel with a per-pair bandwidth sig * sig2
        norm = np.linalg.norm(x - y)
        dist = norm * norm
        return np.exp(-dist / (2 * sig * sig2))

    def affinity(self, data):
        # O(N^2) reference implementation; affinity_fast below is equivalent
        N = data.shape[0]
        sig = []
        ans = np.zeros((N, N))
        for i in range(N):
            dists = []
            for j in range(N):
                dists.append(np.linalg.norm(data[i, :] - data[j, :]))
            dists.sort()
            # Local scale: mean distance to the 5 nearest neighbors
            sig.append(np.mean(dists[:5]))
        for i in range(N):
            for j in range(N):
                ans[i][j] = self.squared_exponential(data[i], data[j],
                                                     sig[i], sig[j])
        return ans

    def affinity_fast(self, data):
        N = data.shape[0]
        ans = np.zeros((N, N))
        dists = distance.cdist(data, data)
        dists.sort()
        # Local scale: mean distance to the 5 nearest neighbors
        sig = np.mean(dists[:, :5], axis=1)
        for i in range(N):
            for j in range(N):
                ans[i][j] = self.squared_exponential(data[i], data[j],
                                                     sig[i], sig[j])
        return ans

    def get_laplacian_features(self, data):
        W = self.affinity_fast(data)
        # D^{-1/2} on the diagonal
        D_half_inv = np.zeros(W.shape)
        tmp = np.sum(W, axis=1)
        D_half_inv.flat[::len(tmp) + 1] = tmp ** (-0.5)
        # Normalized affinity D^{-1/2} W D^{-1/2}; its largest eigenvectors
        # match the smallest ones of the symmetric normalized Laplacian
        L = D_half_inv.dot(W).dot(D_half_inv)
        w, v = scipy.sparse.linalg.eigs(L, self.k_)
        X = v.real
        # Normalize each row to unit length (Ng-Jordan-Weiss style)
        rows_norm = np.linalg.norm(X, axis=1, ord=2)
        X = (X.T / rows_norm).T
        return X

    def fit(self, data):
        V = self.get_laplacian_features(data)
        self.kmeans.fit(V)

    def predict(self, data):
        V = self.get_laplacian_features(data)
        return self.kmeans.predict(V)
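# A minimal usage sketch (assumed; K_Means is the hand-written implementation
# this class delegates to). Two concentric rings are a case where plain K-Means
# fails but clustering in the Laplacian embedding succeeds.
import numpy as np

rng = np.random.default_rng(0)
theta = rng.uniform(0, 2 * np.pi, 100)
inner = np.c_[np.cos(theta[:50]), np.sin(theta[:50])]
outer = 3 * np.c_[np.cos(theta[50:]), np.sin(theta[50:])]
X = np.vstack([inner, outer])

sc = SpectralCluster(n_clusters=2)
sc.fit(X)
labels = sc.predict(X)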