sum_all += d[j] # 5. Scale sum_all by random() to get a random threshold. sum_all *= random() # 6. Walk the distances, subtracting each from the threshold; the first sample that leaves it no longer > 0 becomes center i. for j, di in enumerate(d): sum_all -= di if sum_all > 0: continue cluster_centers[i] = np.copy(points[j, ]) break return cluster_centers if __name__ == "__main__": k = 4 # Number of cluster centers. file_path = "data.txt" # 1. Load the data. print("---------- 1.load data ------------") data = load_data(file_path) # 2. K-Means++ initialisation of the cluster centers. print("---------- 2.K-Means++ generate centers ------------") centroids = get_centroids(data, k) # 3. Run the clustering. print("---------- 3.kmeans ------------") subCenter = kmeans(data, k, centroids) # 4. Save the per-sample cluster assignment file. print("---------- 4.save subCenter ------------") save_result("sub_pp", subCenter) # 5. Save the cluster centers. print("---------- 5.save centroids ------------") save_result("center_pp", centroids)
sum_all += d[j] # 5. Scale sum_all by random() to get a random threshold. sum_all *= random() # 6. Walk the distances, subtracting each from the threshold; the first sample that leaves it no longer > 0 becomes center i. for j, di in enumerate(d): sum_all -= di if sum_all > 0: continue cluster_centers[i] = np.copy(points[j, ]) break return cluster_centers if __name__ == "__main__": k = 4# Number of cluster centers. file_path = "data.txt" # 1. Load the data. print "---------- 1.load data ------------" data = load_data(file_path) # 2. K-Means++ initialisation of the cluster centers. print "---------- 2.K-Means++ generate centers ------------" centroids = get_centroids(data, k) # 3. Run the clustering. print "---------- 3.kmeans ------------" subCenter = kmeans(data, k, centroids) # 4. Save the per-sample cluster assignment file. print "---------- 4.save subCenter ------------" save_result("sub_pp", subCenter) # 5. Save the cluster centers. print "---------- 5.save centroids ------------" save_result("center_pp", centroids)
def getCentroids(points, k):
    """Pick ``k`` initial cluster centers from the rows of ``points``.

    The first center is a uniformly random row of ``points``.  Every further
    center is drawn by roulette-wheel selection: each row is weighted by the
    value ``nearest`` returns for it against the centers chosen so far, so
    rows with larger values are proportionally more likely to be picked.

    Args:
        points: 2-D sample container indexable as ``points[j, ]``
            (assumed to be an ``np.matrix`` with one sample per row --
            TODO confirm against ``load_data``).
        k: Number of centers to produce.

    Returns:
        A ``k x n`` ``np.matrix`` whose rows are copies of rows of ``points``.

    NOTE(review): assumes ``nearest(point, centers)`` returns a non-negative
    scalar distance to the closest existing center -- it is defined outside
    this block; confirm.
    """
    m, n = np.shape(points)
    # np.mat was dropped from the NumPy 2.0 namespace; np.asmatrix builds the
    # same matrix object and exists in every NumPy version.
    cluster_centers = np.asmatrix(np.zeros((k, n)))

    # Seed: one sample chosen uniformly at random.
    index = np.random.randint(0, m)
    cluster_centers[0, ] = np.copy(points[index, ])

    d = [0.0] * m
    for i in range(1, k):
        # Weight of every sample against the i centers selected so far.
        sum_all = 0
        for j in range(m):
            d[j] = nearest(points[j, ], cluster_centers[0:i, ])
            sum_all += d[j]

        # Random threshold somewhere inside the total weight.
        sum_all *= random()

        # Subtract weights until the threshold is used up.  Round-off can
        # leave a tiny positive remainder after the final subtraction; fall
        # back to the last sample then instead of keeping an all-zero center.
        chosen = m - 1
        for j, di in enumerate(d):
            sum_all -= di
            if sum_all > 0:
                continue
            chosen = j
            break
        cluster_centers[i] = np.copy(points[chosen, ])
    return cluster_centers


if __name__ == '__main__':
    k = 4  # number of clusters
    file_path = '../../../train_data/KMeans.train'

    # Load the samples, seed the centers, then run the clustering.
    data = load_data(file_path)
    centroids = getCentroids(data, k)
    # kmeans returns a pair; only its second element is persisted here.
    _, subCenter = kmeans(data, k, centroids)

    # Persist the kmeans result and the center matrix.
    save_result('../../../model/model.kmcpp.sub', subCenter)
    save_result('../../../model/model.kmcpp.center', centroids)