Beispiel #1
0
def kmeans(samples, k, cutoff):
    """
    Group ``samples`` into ``k`` clusters with iterative k-means.

    Args:
        samples: Population of sample points; ``k`` of them are drawn with
            ``random.sample`` to seed the clusters.
        k: Number of clusters to build.
        cutoff: Convergence threshold. Iteration stops once the largest
            value returned by ``Cluster.update`` in a pass is below it.

    Returns:
        The list of ``k`` ``Cluster`` objects after the final update pass.

    Note:
        ``Cluster`` and ``get_distance`` come from outside this function.
        ``Cluster.update`` is presumably returning how far the centroid
        moved -- confirm against its definition.
    """
    # Seed every cluster with one randomly drawn sample.
    seeds = random.sample(samples, k)
    clusters = [Cluster([seed]) for seed in seeds]

    iteration = 0
    while True:
        # One fresh membership bucket per cluster for this pass.
        members = [[] for _ in clusters]
        iteration += 1

        # Assignment step: put each sample in the bucket of its nearest centroid.
        for sample in samples:
            best_index = 0
            best_distance = get_distance(sample, clusters[0].centroid)

            # Compare against the remaining centroids; ties keep the earlier one.
            for index in range(1, k):
                candidate = get_distance(sample, clusters[index].centroid)
                if candidate < best_distance:
                    best_distance, best_index = candidate, index

            members[best_index].append(sample)

        # Update step: refresh every cluster and track the largest shift seen.
        largest_move = 0.0
        for index in range(k):
            move = clusters[index].update(members[index])
            largest_move = max(largest_move, move)

        # Stop as soon as no cluster moved by at least ``cutoff``.
        if largest_move < cutoff:
            print("第{}次迭代后,聚类稳定。".format(iteration))
            break

    return clusters
Beispiel #2
0
def kmeans(samples, k, cutoff):
    """
    Cluster ``samples`` into ``k`` groups with iterative k-means.

    Each iteration assigns every sample to its nearest centroid, updates
    all ``k`` clusters with their new members, and only then checks for
    convergence. The function returns once the largest shift reported by
    any cluster in a single iteration is below ``cutoff``.

    Args:
        samples: Population of sample points; ``k`` of them are drawn with
            ``random.sample`` as the initial cluster seeds.
        k: Number of clusters to build.
        cutoff: Positive convergence threshold for the largest per-cluster
            shift in one iteration.

    Returns:
        The list of ``k`` ``Cluster`` objects after convergence.

    Raises:
        ValueError: If ``cutoff`` is not positive, or (from
            ``random.sample``) if ``k`` is negative or larger than the
            population.

    Note:
        ``Cluster`` and ``get_distance`` are defined outside this function.
        ``Cluster.update`` is treated as returning the distance its centroid
        moved -- confirm against its definition.
    """
    # The largest shift starts at 0.0 and only grows via max(), so a
    # non-positive cutoff could never satisfy the strict ``<`` test below
    # and the loop would spin forever.
    if cutoff <= 0:
        raise ValueError("cutoff must be positive")

    # Randomly choose k samples as the initial centroids.
    init_samples = random.sample(samples, k)

    # Build k clusters, each seeded with one of the chosen samples.
    clusters = [Cluster([sample]) for sample in init_samples]

    # Iterate until the clustering reaches a steady state.
    n_loop = 0
    while True:
        # Fresh per-cluster membership lists for this iteration.
        lists = [[] for _ in clusters]

        n_loop += 1
        for sample in samples:
            # Start with cluster 0 as the best candidate.
            smallest_distance = get_distance(sample, clusters[0].centroid)
            cluster_index = 0

            # Check the remaining centroids; ties keep the earlier cluster.
            for i in range(1, k):
                distance = get_distance(sample, clusters[i].centroid)
                if distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = i

            # Record the sample under its nearest centroid.
            lists[cluster_index].append(sample)

        # Reset once per iteration, outside the sample loop, so it is
        # always defined before the update pass.
        biggest_shift = 0.0

        # Update every cluster before judging convergence; bailing out
        # mid-pass would leave later clusters with stale centroids.
        for cluster, members in zip(clusters, lists):
            shift = cluster.update(members)
            biggest_shift = max(biggest_shift, shift)

        # Converged: no centroid moved by at least ``cutoff``.
        if biggest_shift < cutoff:
            print("{} iterate, stable.".format(n_loop))
            break

    # Return only after the loop has actually converged.
    return clusters