def fit(self, data):
    """Fit spectral clustering on `data` (n_points x n_features array).

    Pipeline: kNN connectivity graph -> symmetrically normalized Laplacian
    L_sym = D^{-1/2} (D - W) D^{-1/2} -> k smallest eigenvectors -> row
    normalization -> k-means on the spectral embedding.

    Sets: weight_, degree_, laplacians_, eigen_vector_, label_, fitted.
    """
    # step 1: symmetric 0/1 weight matrix from the kNN graph.
    # kneighbors_graph is directed; averaging with the transpose symmetrizes it.
    weight = kneighbors_graph(data, n_neighbors=self.n_neighbors_,
                              mode='connectivity', include_self=False)
    weight = 0.5 * (weight + weight.T)
    self.weight_ = weight.toarray()
    degrees = np.sum(self.weight_, axis=0).ravel()
    self.degree_ = np.diag(degrees)

    # step 2: normalized Laplacian.
    # BUG FIX: np.linalg.inv(self.degree_) raised LinAlgError whenever a point
    # was isolated (zero degree). Compute D^{-1/2} elementwise on the diagonal
    # and zero out isolated entries instead; also avoids a dense NxN inversion.
    self.laplacians_ = self.degree_ - self.weight_
    with np.errstate(divide='ignore'):
        inv_sqrt_deg = 1.0 / np.sqrt(degrees)
    inv_sqrt_deg[degrees == 0] = 0.0
    degree_nor = np.diag(inv_sqrt_deg)
    self.laplacians_ = degree_nor @ self.laplacians_ @ degree_nor

    # step 3: eigenvectors of the k smallest eigenvalues (eigh: symmetric input),
    # then normalize each embedding row to unit length.
    eigen_values, eigen_vector = np.linalg.eigh(self.laplacians_)
    sort_index = eigen_values.argsort()
    eigen_vector = eigen_vector[:, sort_index]
    self.eigen_vector_ = eigen_vector[:, :self.n_clusters_]
    row_norms = np.linalg.norm(self.eigen_vector_, axis=1).reshape(-1, 1)
    # guard: a zero row would otherwise produce NaNs (0/0)
    row_norms[row_norms == 0] = 1.0
    self.eigen_vector_ = self.eigen_vector_ / row_norms

    # step 4: cluster the embedded points with k-means.
    spectral_kmeans = KMeans.K_Means(n_clusters=self.n_clusters_)
    spectral_kmeans.fit(self.eigen_vector_)
    self.label_ = spectral_kmeans.predict(self.eigen_vector_)
    self.fitted = True
def fit(self, data):
    """Fit a Gaussian Mixture Model to `data` (n_points x n_features) via EM.

    Initialization: k-means centers, uniform priors, identity covariances.
    Iterates E/M steps until the log-likelihood change drops below
    tolerance_ * n_clusters_ or max_iter_ is reached.

    Sets: mu_, cov_, prior_, posteriori_, Nk_, fitted.
    """
    data = np.asarray(data)
    n_points, dim = data.shape

    # step 1: initialize means from k-means; uniform mixing weights.
    k_means = KMeans.K_Means(self.n_clusters_)
    k_means.fit(data)
    self.mu_ = np.asarray(k_means.centers_)
    self.prior_ = np.full((self.n_clusters_, 1), 1.0 / self.n_clusters_)
    self.posteriori_ = np.zeros((self.n_clusters_, n_points))
    # BUG FIX: covariances were hard-coded eye(2, 2); size them to the data dim.
    self.cov_ = np.asarray([np.eye(dim)] * self.n_clusters_)

    log_likelihood_before = -np.inf
    for _ in range(self.max_iter_):
        # E-step: responsibilities gamma(k, n) = prior_k * N(x_n | mu_k, cov_k),
        # normalized over k.
        for k in range(self.n_clusters_):
            self.posteriori_[k] = multivariate_normal.pdf(
                x=data, mean=self.mu_[k], cov=self.cov_[k])
        self.posteriori_ *= self.prior_  # (K,1) broadcasts over (K,N)
        evidence = np.sum(self.posteriori_, axis=0)
        # BUG FIX: the log-likelihood is sum_n log sum_k prior_k * pdf_k(x_n);
        # the old code summed log of the *normalized* posteriors instead.
        log_likelihood_after = np.sum(np.log(evidence))
        self.posteriori_ /= evidence

        # M-step: closed-form updates from the responsibilities.
        self.Nk_ = np.sum(self.posteriori_, axis=1)
        self.mu_ = (self.posteriori_ @ data) / self.Nk_[:, None]
        # weighted scatter; broadcasting avoids building an NxN diag matrix
        self.cov_ = np.asarray([
            ((data - self.mu_[k]).T * self.posteriori_[k])
            @ (data - self.mu_[k]) / self.Nk_[k]
            for k in range(self.n_clusters_)
        ])
        # BUG FIX: mixing weights are Nk / N (number of points), not Nk / K.
        self.prior_ = (self.Nk_ / n_points).reshape(self.n_clusters_, 1)

        if np.abs(log_likelihood_after - log_likelihood_before) \
                < self.tolerance_ * self.n_clusters_:
            break
        log_likelihood_before = log_likelihood_after
    self.fitted = True
def fit(self, data):
    """Fit a k-component Gaussian Mixture Model to `data` via EM.

    Initialization: k-means centers, uniform priors, identity covariances.
    Runs exactly max_iter E/M iterations (no convergence test, matching the
    original loop structure).

    Sets: mu, cov, prior, posteriori, Nk, fitted.
    """
    data = np.asarray(data)
    n_points, dim = data.shape

    # step 1: initialize mu from k-means centers (k x dim).
    k_means = KMeans.K_Means(n_clusters=self.k)
    k_means.fit(data)
    self.mu = np.asarray(k_means.centers_)
    # BUG FIX: covariances were hard-coded eye(2, 2); size them to the data dim.
    self.cov = np.asarray([np.eye(dim)] * self.k)
    # BUG FIX: reshape used the literal 3 instead of self.k, crashing for k != 3.
    self.prior = np.full((self.k, 1), 1.0 / self.k)
    self.posteriori = np.zeros((self.k, n_points))  # responsibilities, k x N

    for _ in range(self.max_iter):
        # E-step: responsibility of component k for each point, normalized
        # over components.
        for k in range(self.k):
            self.posteriori[k] = multivariate_normal.pdf(
                x=data, mean=self.mu[k], cov=self.cov[k])
        self.posteriori *= self.prior  # (k,1) broadcasts over (k,N)
        self.posteriori /= np.sum(self.posteriori, axis=0)

        # M-step: MLE updates for mu, cov, prior.
        self.Nk = np.sum(self.posteriori, axis=1)
        self.mu = np.asarray([
            np.dot(self.posteriori[k], data) / self.Nk[k]
            for k in range(self.k)
        ])
        # weighted scatter; broadcasting avoids an NxN diag intermediate
        self.cov = np.asarray([
            ((data - self.mu[k]).T * self.posteriori[k])
            @ (data - self.mu[k]) / self.Nk[k]
            for k in range(self.k)
        ])
        # BUG FIX: mixing weights are Nk / N (number of points), not Nk / k.
        self.prior = (self.Nk / n_points).reshape(self.k, 1)
    self.fitted = True
def fit(self, data):
    """Fit a GMM to `data` via EM, delegating each step to helper methods.

    Initialization: k-means centers, shared sample covariance, uniform
    mixing weights. Iterates update_w / update_pi / update_mu / update_var
    until the log-likelihood change is below 1e-3 or max_iter_ is reached.
    Per-step wall-clock times are accumulated and printed at the end.

    Sets: w_, pi_, Mu_, Var_.
    NOTE(review): relies on self.update_w/update_pi/update_mu/update_var and
    self.get_log, defined elsewhere — semantics not visible here.
    """
    # Initialization
    n_clusters = self.n_clusters_
    n_points = len(data)
    D = data.shape[1]
    # Alternative initializers kept for reference:
    # roulette-wheel seeding or random sampling of k points.
    #Mu = km.get_initial(data, n_clusters)
    #seed_idx = random.sample(list(range(n_points)),n_clusters)
    #for seed in seed_idx:
    #    Mu.append(data[seed,:])
    # initialize cluster centers from k-means
    kmean = km.K_Means(n_clusters=n_clusters, max_iter=30)
    kmean.fit(data)
    Mu = kmean.cluster_center
    # covariances: K copies of the full-sample covariance, shape K*D*D
    Var = np.asarray([np.cov(data, rowvar=False)] * n_clusters)
    #Var = np.ones((n_clusters, D, D))
    # mixing weights: pi = [1/k, 1/k, ...]
    pi = [1 / n_clusters] * n_clusters
    # per-point responsibilities, uniform at start
    w = np.ones((n_points, n_clusters)) / n_clusters
    #pi = w.sum(axis = 0) / w.sum()
    # iterative EM solve
    log_p = 1
    old_log_p = 0
    loglh = []
    time_w, time_pi, time_mu, time_var = 0, 0, 0, 0
    for i in range(self.max_iter_):
        #self.plot_clusters(X, Mu, Var)
        old_log_p = log_p
        # E step: update responsibilities (posterior weights)
        time_start = time.time()
        w = self.update_w(data, Mu, Var, pi)
        # time_w += time.time() - time_start
        # NOTE(review): the line above appears commented out in the source,
        # so time_w stays 0 in the final print — confirm this is intended.
        # M step
        # update mixing weights pi
        time_start = time.time()
        pi = self.update_pi(w)
        time_pi += time.time() - time_start
        # update cluster centers
        time_start = time.time()
        Mu = self.update_mu(data, w)
        time_mu += time.time() - time_start
        # update covariance matrices
        time_start = time.time()
        Var = self.update_var(data, Mu, w)
        time_var += time.time() - time_start
        log_p = self.get_log(data, pi, Mu, Var)
        #loglh.append(log_p)
        #print('log-likehood:%.3f'%loglh[-1])
        # stop when the log-likelihood has converged
        if abs(log_p - old_log_p) < 0.001:
            #print(i)
            break
    # update parameters
    self.w_ = w
    self.pi_ = pi
    self.Mu_ = Mu
    self.Var_ = Var
    print("时间:", time_w, time_pi, time_mu, time_var)
    # (tail of a plotting/generation function whose definition starts above
    # this view) — scatter the three clusters and return the stacked samples
    k3 = np.array(C3)
    plt.scatter(k1[:, 0], k1[:, 1], s=5)
    plt.scatter(k2[:, 0], k2[:, 1], s=5)
    plt.scatter(k3[:, 0], k3[:, 1], s=5)
    plt.show()
    return X


if __name__ == '__main__':
    # generate sample data from three 2-D Gaussians
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)
    # K-means
    kmeans = km.K_Means(n_clusters=3)
    kmeans.fit(X)
    cat = kmeans.predict(X)
    print(cat)
    #show_cluster(cat, X)  # visualize predictions
    # GMM
    gmm = GMM(n_clusters=3)
    gmm.fit(X)
    cat = gmm.predict(X)
    print(cat)
    show_cluster(cat, X)
    # spectral clustering on the same data
    spectral_clustering = sc.SC(n_clusters=3, knn_k=5)
    spectral_clustering.fit(X)