Example #1
0
def agglomerative_l_method(X, method='ward'):
    """Cluster ``X`` hierarchically, letting the L-method pick the cluster count.

    The merge history is computed with ``linkage`` (fastcluster) using the
    Euclidean metric. The merge distances, ordered from the last merge to the
    first, are paired with the candidate cluster counts ``2 .. len(X)`` and
    handed to ``refined_l_method``, whose return value is used as the number
    of clusters. The first ``len(X) - cluster_count`` merges are then replayed
    on a ``DisjointSet`` to assign every instance to a cluster.

    Args:
        X: The instances to cluster; must support ``len()`` and be accepted
            by ``linkage`` and ``get_centroids``.
        method: Linkage method forwarded to ``linkage`` (default ``'ward'``).

    Returns:
        ``Result(labels, centroids)`` where ``labels[i]`` is the cluster label
        of instance ``i`` (labels are consecutive integers starting at 0,
        numbered in order of first appearance) and ``centroids`` is whatever
        ``get_centroids(X, labels)`` returns.
    """
    # library: fastcluster
    merge_hist = linkage(X, method=method, metric='euclidean', preserve_input=True)
    n_samples = len(X)

    # L-method input: x is the cluster count [2 -> N], y is the merge
    # distance with the most recent (last) merge first.
    num_groups = list(range(2, n_samples + 1))
    merge_dist = [step[2] for step in merge_hist][::-1]

    cluster_count = refined_l_method(num_groups, merge_dist)

    # Replay only the earliest merges; stopping after N - cluster_count of
    # them is intended to leave cluster_count groups.
    # NOTE(review): linkage rows may reference ids >= N for already-merged
    # clusters -- presumably DisjointSet.join copes with that; confirm.
    disjoint = DisjointSet(n_samples)
    for left, right, _, _ in islice(merge_hist, n_samples - cluster_count):
        disjoint.join(int(left), int(right))

    # Representative of the set each instance ended up in.
    roots = [disjoint.parent(idx) for idx in range(n_samples)]

    # Compact the representatives into labels 0, 1, 2, ... assigned in order
    # of first appearance.
    label_of_root = {}
    labels = [label_of_root.setdefault(root, len(label_of_root)) for root in roots]

    centroids = get_centroids(X, labels)

    return Result(labels, centroids)
    def fit(self, X, max_merge_dist):
        """Cluster ``X`` with Ward linkage, cutting at ``max_merge_dist``.

        The merge history from ``linkage`` (Ward method, Euclidean metric) is
        replayed in order on a ``DisjointSet``; replay stops at the first
        merge whose distance exceeds ``max_merge_dist``. Each instance is then
        labelled by its set, the labels are compacted to consecutive integers
        starting at 0 (in order of first appearance), and
        ``self.get_centroids`` is called with those labels.

        Side effects: stores ``X`` in ``self.X``, ``max_merge_dist`` in
        ``self.max_merge_dist`` and the centroids in ``self.cluster_centers_``.

        Args:
            X: The instances to cluster; must support ``len()``.
            max_merge_dist: Merges with a distance strictly greater than this
                value (and all merges after the first such one) are skipped.

        Returns:
            The pair ``(centroids, cluster_member_cnt)`` exactly as returned
            by ``self.get_centroids``.
        """
        self.X = X
        self.max_merge_dist = max_merge_dist

        merge_hist = linkage(X, method='ward', metric='euclidean', preserve_input=True)

        n_samples = len(X)
        disjoint = DisjointSet(n_samples)

        # NOTE(review): linkage rows may reference ids >= len(X) for
        # already-merged clusters -- presumably DisjointSet.join copes with
        # that; confirm.
        for left, right, dist, _ in merge_hist:
            if dist > max_merge_dist:
                # Stop at the first merge that is too far apart.
                break
            disjoint.join(int(left), int(right))

        # Representative of the set each instance ended up in.
        roots = [disjoint.parent(idx) for idx in range(n_samples)]

        # Compact the representatives into labels 0, 1, 2, ... assigned in
        # order of first appearance.
        label_of_root = {}
        labels = [label_of_root.setdefault(root, len(label_of_root)) for root in roots]

        centroids, cluster_member_cnt = self.get_centroids(X, labels)
        self.cluster_centers_ = centroids

        return centroids, cluster_member_cnt