sum_all += d[j]
        # 5、取得sum_all之间的随机值
        sum_all *= random()
        # 6、获得距离最远的样本点作为聚类中心点
        for j, di in enumerate(d):
            sum_all -= di
            if sum_all > 0:
                continue
            cluster_centers[i] = np.copy(points[j, ])
            break
    return cluster_centers

if __name__ == "__main__":
    # Script entry point: load samples, seed centers, cluster, and save results.
    k = 4 # number of cluster centers
    file_path = "data.txt"
    # 1. Load the data
    print("---------- 1.load data ------------")
    data = load_data(file_path)
    # 2. Initialize the cluster centers with the K-Means++ method
    print("---------- 2.K-Means++ generate centers ------------")
    centroids = get_centroids(data, k)
    # 3. Run the clustering
    print("---------- 3.kmeans ------------")
    subCenter = kmeans(data, k, centroids)
    # 4. Save the file of per-sample cluster assignments
    print("---------- 4.save subCenter ------------")
    save_result("sub_pp", subCenter)
    # 5. Save the cluster centers
    # NOTE(review): this saves `centroids` as bound above; it only holds the
    # final centers if kmeans updates it in place -- confirm against kmeans.
    print("---------- 5.save centroids ------------")
    save_result("center_pp", centroids)
            sum_all += d[j]
        # 5、取得sum_all之间的随机值
        sum_all *= random()
        # 6、获得距离最远的样本点作为聚类中心点
        for j, di in enumerate(d):
            sum_all -= di
            if sum_all > 0:
                continue
            cluster_centers[i] = np.copy(points[j, ])
            break
    return cluster_centers

if __name__ == "__main__":
    # Script entry point: load samples, seed centers, cluster, and save results.
    # The progress messages use the parenthesized print form, which emits the
    # same text on Python 2 and is valid syntax on Python 3 (the bare
    # `print "..."` statement is a SyntaxError there).
    k = 4  # number of cluster centers
    file_path = "data.txt"
    # 1. Load the data
    print("---------- 1.load data ------------")
    data = load_data(file_path)
    # 2. Initialize the cluster centers with the K-Means++ method
    print("---------- 2.K-Means++ generate centers ------------")
    centroids = get_centroids(data, k)
    # 3. Run the clustering
    print("---------- 3.kmeans ------------")
    subCenter = kmeans(data, k, centroids)
    # 4. Save the file of per-sample cluster assignments
    print("---------- 4.save subCenter ------------")
    save_result("sub_pp", subCenter)
    # 5. Save the cluster centers
    print("---------- 5.save centroids ------------")
    save_result("center_pp", centroids)
def getCentroids(points, k):
    """Choose ``k`` initial cluster centers using K-Means++ style seeding.

    The first center is a uniformly random row of ``points``. Every further
    center is drawn by roulette-wheel selection in which row ``j`` is
    weighted by ``nearest(points[j], centers_chosen_so_far)``.

    Args:
        points: 2-D array or matrix with one sample per row.
        k: Number of centers to produce.

    Returns:
        A ``k x n`` ``numpy.matrix`` whose rows are copies of the selected
        rows of ``points`` (``n`` is the number of columns of ``points``).

    NOTE(review): ``nearest`` is defined elsewhere; it is presumably the
    (squared) distance from a sample to its closest already-chosen center --
    confirm against its definition.
    """
    m, n = np.shape(points)
    # np.asmatrix is the function np.mat used to alias; np.mat itself was
    # removed from the NumPy 2.0 namespace.
    cluster_centers = np.asmatrix(np.zeros((k, n)))
    # Seed with one uniformly random sample.
    index = np.random.randint(0, m)
    cluster_centers[0, ] = np.copy(points[index, ])
    d = [0.0 for _ in range(m)]
    for i in range(1, k):
        # Weight of every sample relative to the centers picked so far.
        sum_all = 0
        for j in range(m):
            d[j] = nearest(points[j, ], cluster_centers[0:i, ])
            sum_all += d[j]
        # Random threshold in [0, sum of weights).
        sum_all *= random()
        # Walk the weights until the threshold is used up; the row where
        # that happens becomes the next center.
        for j, di in enumerate(d):
            sum_all -= di
            if sum_all > 0:
                continue
            cluster_centers[i] = np.copy(points[j, ])
            break
        else:
            # Floating-point roundoff can leave a tiny positive residual
            # after every weight has been subtracted. Without a fallback the
            # center would stay an all-zero row, so take the row with the
            # largest weight instead.
            fallback = max(range(m), key=lambda j: d[j])
            cluster_centers[i] = np.copy(points[fallback, ])
    return cluster_centers


if __name__ == '__main__':
    # Script entry point: load the training samples, seed the centers with
    # getCentroids, run kmeans, and hand the results to save_result.
    k = 4  # number of cluster centers requested from getCentroids and kmeans
    file_path = '../../../train_data/KMeans.train'
    data = load_data(file_path)
    centroids = getCentroids(data, k)
    # kmeans is unpacked as a pair here and only the second element is kept
    # (presumably the per-sample cluster assignment -- confirm against kmeans).
    _, subCenter = kmeans(data, k, centroids)
    save_result('../../../model/model.kmcpp.sub', subCenter)
    # NOTE(review): this saves `centroids` as returned by getCentroids; it only
    # holds the final centers if kmeans updates it in place -- verify.
    save_result('../../../model/model.kmcpp.center', centroids)