Beispiel #1
0
    def fit(self, data):
        """Fit the Gaussian mixture to *data* with EM.

        data: array-like of shape (n_samples, 2) — the unit-variance init
        below assumes 2-D points (TODO confirm; covariances are diagonal
        and stored as their diagonal entries only).
        """
        # 1. Initialize parameters.
        # Responsibilities W: uniform over clusters for every sample.
        self.W = np.ones((len(data), self.n_clusters)) / self.n_clusters
        # Mixture weights: uniform prior.
        self.pi = [1 / self.n_clusters] * self.n_clusters
        # Diagonal covariance per cluster (equal variance per axis, so each
        # component visualizes as a circle).  BUG FIX: was hard-coded to
        # three [1, 1] pairs regardless of self.n_clusters.
        self.Var = [[1, 1] for _ in range(self.n_clusters)]

        # Initialize the means with k-means.  BUG FIX: was K_Means(n_clusters=3),
        # ignoring self.n_clusters.
        my_kmeans = K_Means(n_clusters=self.n_clusters)
        my_kmeans.fit(data)
        self.Mu = my_kmeans.get_centers()

        # EM iterations (fixed count; no convergence test here).
        iters = 0
        while iters < self.max_iter:
            # E-step: compute and update the posterior probabilities W.
            self.W = self.update_W(data, self.pi, self.Mu, self.Var)

            # M-step: MLE updates of the model parameters.
            self.pi = self.update_pi(self.W)
            self.Mu = self.update_Mu(data, self.W)
            self.Var = self.update_Var(data, self.W, self.Mu)

            iters += 1
            print("iters = ", iters)
    def fit(self, data):
        """Run spectral clustering on *data*; store labels in self.labels."""
        # Build the spectral embedding: pairwise distances -> kNN weight
        # graph -> graph Laplacian -> eigenvector matrix Y.
        dist = self.get_DistanceMatrix(data)
        weights = self.Distance_to_Weigt_knn(dist, k=10)
        embedding = self.get_YMatrix(self.get_LaplacianMatrix(weights))

        # Cluster the rows of the embedding with our own k-means.
        clusterer = K_Means(n_clusters=self.n_clusters)
        clusterer.fit(embedding)
        self.labels = clusterer.predict(embedding)
Beispiel #3
0
    def fit(self, X, eigValueGap=False, bShowGap=False):
        """Spectral clustering: embed X via Laplacian eigenvectors, then
        k-means the embedding.

        eigValueGap / bShowGap are forwarded to calculateYMatrix (eigen-gap
        estimation / gap plotting — TODO confirm their exact semantics there).
        Returns the per-sample cluster labels.
        """
        W = self.calculateDistanceMatrix(X)
        Adjacent = self.distTransToWeightKNN(W, k=10)
        # 'rm' presumably selects the random-walk normalized Laplacian —
        # verify against calculateLaplacianMatrix's accepted values.
        Laplacian = self.calculateLaplacianMatrix(Adjacent, normalized='rm')
        Y = self.calculateYMatrix(Laplacian,
                                  eigValueGap=eigValueGap,
                                  bShowGap=bShowGap)

        # Cluster the spectral embedding with our own K_Means.
        # CLEANUP: removed the hard-coded `MY_KNN = 1` flag and its
        # unreachable sklearn branch, and renamed the misleading `knn`
        # variable (it held a k-means model, not a kNN model).
        kmeans = K_Means(self.n_clusters)
        kmeans.fit(Y)
        return kmeans.predict(Y)
Beispiel #4
0
 def predict(self, data):
     """Cluster the precomputed similarity graph spectrally.

     Uses self.simi_graph (affinity matrix built beforehand); labels come
     from the graph's Laplacian eigenvectors, not from *data* directly.
     Returns the per-sample cluster assignment from K_Means.
     """
     # Degree matrix: row sums of the similarity graph (total edge weight
     # leaving each node).
     self.D_mat = np.diag(np.sum(self.simi_graph, axis=1))
     # Unnormalized graph Laplacian L = D - W.
     self.lap_mat = self.D_mat - self.simi_graph
     eigenvalues, eigenvectors = np.linalg.eig(self.lap_mat)
     # BUG FIX: eig can return complex eigenpairs from floating-point
     # asymmetry; a real symmetric Laplacian has a real spectrum, so keep
     # the real parts (assumes simi_graph is symmetric — TODO confirm at
     # the call site) instead of feeding complex data to k-means.
     eigenvalues = eigenvalues.real
     eigenvectors = eigenvectors.real
     # Sort eigenpairs ascending; the k_ smallest eigenvectors form the
     # spectral embedding.
     order = eigenvalues.argsort()
     eigenvectors = eigenvectors[:, order]
     k_eigenvectors = eigenvectors[:, :self.k_]
     # Cluster the rows of the embedding.
     k_means = K_Means(self.k_)
     k_means.fit(k_eigenvectors)
     return k_means.predict(k_eigenvectors)
Beispiel #5
0
    def fit(self, data):
        """Spectral-clustering fit: build a kNN similarity graph, form the
        random-walk Laplacian, estimate k from the eigen-gap, and run
        K_Means on the smallest eigenvectors.

        Stores the kd-tree (used by predict) and the per-sample labels in
        self.spectral_result.
        """
        data_num = data.shape[0]
        # BUG FIX: `np.float` was removed in NumPy 1.24 — use builtin float.
        W = np.zeros((data_num, data_num), dtype=float)
        D = np.zeros((data_num, data_num), dtype=float)
        Dinv = np.zeros((data_num, data_num), dtype=float)
        self.kdtree = KDTree(data)
        for ii in range(data_num):
            # Connect each point to ~5% of the data set, at least 10 points.
            eular_dis, idx = self.kdtree.query(data[ii, :],
                                               k=max(int(data_num / 20), 10))
            distance_all = self.distance(eular_dis)
            W[ii, idx] = distance_all
            W[ii, ii] = 0  # no self-loops
        # Symmetrize via the element-wise geometric mean of W and W^T.
        W = np.sqrt(W * W.transpose())
        for ii in range(data_num):
            D[ii, ii] = np.sum(W[ii, :])
            # Guard against (near-)isolated points with zero degree.
            if (D[ii, ii] > 0.0001):
                Dinv[ii, ii] = 1 / D[ii, ii]
            else:
                Dinv[ii, ii] = 1 / 0.0001

        # Random-walk normalized Laplacian: Lrw = I - D^-1 W.
        Lrw = np.eye(data_num) - np.matmul(Dinv, W)
        value, vector = np.linalg.eig(Lrw)
        # Lrw is real but not symmetric; keep the real parts so argsort and
        # k-means operate on real data.
        value, vector = value.real, vector.real
        sort_idx = np.argsort(value)

        # Estimate the cluster count from the eigen-gap.
        k_means_k = self.confirm_k(value, sort_idx)
        print('k is evaluated as {}'.format(k_means_k))

        print('idx:', sort_idx[0:k_means_k], 'lambda',
              value[sort_idx[0:k_means_k]])
        # Embed each sample as its coordinates in the k smallest eigenvectors.
        k_means_data = vector[:, sort_idx[0:k_means_k]]

        self.k_means_manager = K_Means(k_means_k)
        self.k_means_manager.fit(k_means_data)
        self.spectral_result = np.array(
            self.k_means_manager.predict(k_means_data))
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Initialize all clustering algorithms
    # ============
    # Our own K-Means, GMM and Spectral implementations
    my_kmeans = K_Means(n_clusters=params['n_clusters'])
    my_gmm = GMM(n_clusters=params['n_clusters'])
    my_spec = Spectral()
    # Algorithms shipped with sklearn, configured from the same params dict
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
Beispiel #7
0
    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Initialize all clustering algorithms
    # ============
    # Our own K-Means (four fit strategies) and GMM implementations
    my_kmeans0 = K_Means(n_clusters=params['n_clusters'], fit_method=0)
    my_kmeans1 = K_Means(n_clusters=params['n_clusters'], fit_method=1)
    my_kmeans2 = K_Means(n_clusters=params['n_clusters'], fit_method=2)
    my_kmeans3 = K_Means(n_clusters=params['n_clusters'], fit_method=3)
    my_gmm = GMM(n_clusters=params['n_clusters'], dim=X.shape[1])
    # Our spectral clustering in several graph/weight configurations
    # (per the variable names: kNN vs radius graph, reciprocal vs Gaussian
    # weights — TODO confirm against the SpectralClustering constructor)
    my_spectral_knn_reciprocal_normalized = SpectralClustering(
        n_clusters=params['n_clusters'], nnk=50)
    my_spectral_radius_reciprocal_normalized = SpectralClustering(
        n_clusters=params['n_clusters'], use_radius_nn=True, nnradius=1)
    my_spectral_knn_gauss05_normalized = SpectralClustering(
        n_clusters=params['n_clusters'], nnk=50, use_gauss_dist=True)
    my_spectral_knn_gauss005_normalized = SpectralClustering(
        n_clusters=params['n_clusters'],
        nnk=50,
        use_gauss_dist=True,
        gauss_sigma=5e-2)
Beispiel #8
0
        theta = random.uniform(-2 * np.pi, 2 * np.pi)
        R = random.uniform(0, r)
        data.append([x + R * np.cos(theta), y + R * np.sin(theta)])
    return data


if __name__ == '__main__':
    GMM = GMM_Cluster()
    data = []
    data += get_data(-10, 15, 5, 50)
    data += (get_data(-10, -10, 5, 50))
    data += (get_data(10, 12, 5, 50))
    data += get_data(5, -5, 5, 50)
    GMM.train(data, 4)

    KM = K_Means()
    KM.train(data, 4)
    plt.title('GMM Clustering')
    plt.xlim(-20, 20)
    plt.ylim(-20, 20)
    plt.plot([x[0] for x in GMM.Clusters[0]], [x[1] for x in GMM.Clusters[0]],
             'o',
             color='red')
    plt.plot([x[0] for x in GMM.Clusters[1]], [x[1] for x in GMM.Clusters[1]],
             'o',
             color='green')
    plt.plot([x[0] for x in GMM.Clusters[2]], [x[1] for x in GMM.Clusters[2]],
             'o',
             color='blue')
    plt.plot([x[0] for x in GMM.Clusters[3]], [x[1] for x in GMM.Clusters[3]],
             'o',
Beispiel #9
0
class Spectral(object):
    """Spectral clustering with automatic cluster-count estimation.

    fit() builds a kNN similarity graph, forms the random-walk Laplacian,
    estimates k from the eigen-gap, and runs K_Means on the smallest
    eigenvectors.  predict() labels a query point with its nearest
    training point's cluster.
    """

    def __init__(self):
        # Convergence tolerance; kept for interface compatibility (not
        # read by fit/predict in this class).
        self.tolerance_ = 0.00001

    def distance(self, eular_dis):
        """Map Euclidean distance(s) to similarity weights in (0, 1]."""
        return np.exp(-eular_dis)

    def confirm_k(self, value, sort_idx):
        """Estimate the cluster count from the eigen-gap.

        Walks the ascending eigenvalues and stops at the first gap larger
        than the mean of the first five gaps; the stop index is the
        estimate.  (CLEANUP: removed the unused `sum_diff` accumulator.)
        """
        sorted_value = value[sort_idx]
        # Consecutive gaps between sorted eigenvalues.
        diff = sorted_value[1:] - sorted_value[0:-1]
        mean_diff = np.mean(diff[0:5])
        kk = 1  # fallback when the loop body never runs
        for kk in range(1, value.shape[0]):
            if sorted_value[kk] - sorted_value[kk - 1] > mean_diff:
                break
        return kk

    def fit(self, data):
        """Build the similarity graph, embed, and cluster the samples."""
        data_num = data.shape[0]
        # BUG FIX: `np.float` was removed in NumPy 1.24 — use builtin float.
        W = np.zeros((data_num, data_num), dtype=float)
        D = np.zeros((data_num, data_num), dtype=float)
        Dinv = np.zeros((data_num, data_num), dtype=float)
        self.kdtree = KDTree(data)
        for ii in range(data_num):
            # Connect each point to ~5% of the data set, at least 10 points.
            eular_dis, idx = self.kdtree.query(data[ii, :],
                                               k=max(int(data_num / 20), 10))
            distance_all = self.distance(eular_dis)
            W[ii, idx] = distance_all
            W[ii, ii] = 0  # no self-loops
        # Symmetrize via the element-wise geometric mean of W and W^T.
        W = np.sqrt(W * W.transpose())
        for ii in range(data_num):
            D[ii, ii] = np.sum(W[ii, :])
            # Guard against (near-)isolated points with zero degree.
            if (D[ii, ii] > 0.0001):
                Dinv[ii, ii] = 1 / D[ii, ii]
            else:
                Dinv[ii, ii] = 1 / 0.0001

        # Random-walk normalized Laplacian: Lrw = I - D^-1 W.
        Lrw = np.eye(data_num) - np.matmul(Dinv, W)
        value, vector = np.linalg.eig(Lrw)
        # Lrw is real but not symmetric; keep the real parts so argsort and
        # k-means operate on real data.
        value, vector = value.real, vector.real
        sort_idx = np.argsort(value)

        # Estimate the cluster count from the eigen-gap.
        k_means_k = self.confirm_k(value, sort_idx)
        print('k is evaluated as {}'.format(k_means_k))

        print('idx:', sort_idx[0:k_means_k], 'lambda',
              value[sort_idx[0:k_means_k]])
        # Embed each sample as its coordinates in the k smallest eigenvectors.
        k_means_data = vector[:, sort_idx[0:k_means_k]]

        self.k_means_manager = K_Means(k_means_k)
        self.k_means_manager.fit(k_means_data)
        self.spectral_result = np.array(
            self.k_means_manager.predict(k_means_data))

    def predict(self, data):
        """Label each query point with its nearest training point's cluster."""
        ret = []
        for ii in range(data.shape[0]):
            # 1-NN lookup in the kd-tree built during fit().
            distance, idx = self.kdtree.query(data[ii, :], k=1)
            ret.append(self.spectral_result[idx])
        return ret
Beispiel #10
0
                  [[12, 11], 'C'], [[3, 4], 'B'], [[10, 12], 'C'], [[6, 7],
                                                                    'A']]

# Five 2-D query points — presumably test inputs for cluster prediction;
# verify intended use against the caller.
qua3_test_data = [[12, 20], [8, 9], [13, 14], [7, 5], [9, 11]]


def get_data(x, y, r, num):
    """Sample *num* random points inside the circle of radius *r* centred
    at (x, y); return them as a list of [px, py] pairs.

    Note: drawing an angle and then a uniform radius concentrates samples
    near the centre (kept to match the original sampling scheme).
    """
    def _sample():
        # Draw angle first, then radius — RNG call order matters for
        # reproducibility with a seeded `random`.
        angle = random.uniform(-2 * np.pi, 2 * np.pi)
        radius = random.uniform(0, r)
        return [x + radius * np.cos(angle), y + radius * np.sin(angle)]

    return [_sample() for _ in range(num)]


# Build three circular blobs of 50 points each and cluster them with our
# K_Means implementation.
model = K_Means()
data = []
data += get_data(-10, 10, 8, 50)
data += (get_data(0, -5, 8, 50))
data += (get_data(10, 10, 8, 50))
#for i in range(num):
#    x = random.uniform(-20,20)
#    y = random.uniform(-20,20)
#    data.append([x,y])

# Fit with k=3 using Euclidean distance ('Euc'), then collect the
# resulting clusters (model.C) and their centroids (model.Ave) —
# attribute semantics presumed from usage; verify against K_Means.
model.train(data, k=3, metric='Euc')
clusters = []
for c in model.C:
    clusters.append(c)
ave = model.Ave
#print(clusters)
Beispiel #11
0
class SpectralCluster(object):
    """Normalized spectral clustering: self-tuning Gaussian affinity ->
    D^-1/2 W D^-1/2 -> top-k eigenvectors -> row-normalize -> K_Means.
    """

    # n_clusters: number of groups to form
    def __init__(self, n_clusters=2):
        self.k_ = n_clusters
        self.kmeans = K_Means(n_clusters=n_clusters)

    def squared_exponential(self, x, y, sig=0.8, sig2=1):
        """Gaussian (RBF) kernel between x and y with per-point scales."""
        norm = np.linalg.norm(x - y)
        dist = norm * norm
        return np.exp(-dist / (2 * sig * sig2))

    def affinity(self, data):
        """O(N^2) reference implementation of the self-tuning affinity
        matrix; fit() uses the vectorized affinity_fast() instead."""
        N = data.shape[0]
        sig = []
        ans = np.zeros((N, N))
        for i in range(N):
            dists = []
            for j in range(N):
                dis = np.linalg.norm(data[i, :] - data[j, :])
                dists.append(dis)
            dists.sort()
            # Per-point bandwidth: mean of the 5 smallest distances
            # (includes the zero self-distance).
            sig.append(np.mean(dists[:5]))

        for i in range(N):
            for j in range(N):
                ans[i][j] = self.squared_exponential(data[i], data[j], sig[i],
                                                     sig[j])
        return ans

    def affinity_fast(self, data):
        """Vectorized self-tuning Gaussian affinity matrix."""
        N = data.shape[0]
        sig = []
        ans = np.zeros((N, N))
        dists = distance.cdist(data, data)

        # In-place row sort: row i then holds point i's distances ascending.
        dists.sort()
        sig = np.mean(dists[:, :5],
                      axis=1)  # mean of 5 nearest distances as bandwidth

        for i in range(N):
            for j in range(N):
                ans[i][j] = self.squared_exponential(data[i], data[j], sig[i],
                                                     sig[j])

        return ans

    def get_laplacian_features(self, data):
        """Return the row-normalized top-k eigenvector embedding of the
        symmetrically normalized affinity D^-1/2 W D^-1/2."""
        N = data.shape[0]
        W = self.affinity_fast(data)
        D_half_inv = np.zeros(W.shape)
        tmp = np.sum(W, axis=1)
        # Fill the diagonal with degree^-1/2 via the flat-stride trick.
        D_half_inv.flat[::len(tmp) + 1] = tmp**(-0.5)
        L = D_half_inv.dot(W).dot(D_half_inv)  #graph laplacian
        # eigs defaults to largest-magnitude eigenpairs, which are the
        # informative ones for this similarity-form matrix.
        w, v = scipy.sparse.linalg.eigs(L, self.k_)
        X = v.real
        # Row-normalize so each sample lies on the unit sphere.
        rows_norm = np.linalg.norm(X, axis=1, ord=2)
        X = (X.T / rows_norm).T
        return X

    def fit(self, data):
        """Compute the spectral embedding of *data* and fit K_Means on it."""
        V = self.get_laplacian_features(data)
        self.kmeans.fit(V)

    def predict(self, data):
        """Re-embed *data* and return K_Means labels.  NOTE(review): the
        embedding is recomputed from *data* itself, so this is meaningful
        mainly on the training set."""
        V = self.get_laplacian_features(data)
        return self.kmeans.predict(V)
Beispiel #12
0
 def __init__(self, n_clusters=2):
     """Remember the target cluster count and prepare the K_Means backend."""
     self.kmeans = K_Means(n_clusters=n_clusters)
     self.k_ = n_clusters