def _kng_gaussian_weights(sq_dist, t):
    """Turn squared distances into heat-kernel weights ``exp(-d / t)``.

    ``t`` may be "mean" or "median" (bandwidth estimated from ``sq_dist``)
    or any other value, which is used directly as the bandwidth.
    """
    if t == "mean":
        t = np.mean(sq_dist)
    elif t == "median":
        t = np.median(sq_dist)
    return np.exp(-sq_dist / t)


def _kng_t_free_weights(sq_dist, knn):
    """Turn the ``knn + 1`` smallest squared distances of each row into
    row-normalised "t_free" weights ``(d_{k+1} - d_j) / sum_j (d_{k+1} - d_j)``.

    Assumes every row of ``sq_dist`` is in ascending order (it is taken
    along argsort indices by the caller).
    """
    Val = sq_dist[:, knn].reshape((-1, 1)) - sq_dist[:, :knn]
    # If d_{k+1} == d_1 the whole row is zero; use uniform weights there
    # instead of dividing 0 by 0 below.
    ind0 = np.where(Val[:, 0] == 0)[0]
    if len(ind0) > 0:
        Val[ind0, :] = 1 / knn
    return Val / np.sum(Val, axis=1).reshape(-1, 1)


def kng(X, knn, way="gaussian", t="mean", Anchor=0, isSym=True):
    """
    Build a k-nearest-neighbour similarity graph, either between the samples
    themselves (n by n) or between the samples and an anchor set (n by m).

    :param X: data matrix of n by d
    :param knn: the number of nearest neighbors
    :param way: one of ["gaussian", "t_free"]
        "t_free" denote the method proposed in :
            "The constrained laplacian rank algorithm for graph-based clustering"
        "gaussian" denote the heat kernel
    :param t: only needed by gaussian, the bandwidth parameter:
        "mean" / "median" of the selected squared distances, or a value used as-is
    :param Anchor: Anchor set, m by d; any int (default 0) means "no anchors"
    :param isSym: if True, the n by n graph is symmetrised as (A + A.T) / 2;
        not used for the anchor graph
    :return: A, a dense array (graph) of n by n if Anchor is an int (default),
        otherwise of n by m
    :raises ValueError: if ``way`` is not "gaussian" or "t_free"
    """
    if way not in ("gaussian", "t_free"):
        raise ValueError(
            "way must be 'gaussian' or 't_free', got {!r}".format(way))

    N, dim = X.shape

    if isinstance(Anchor, int):
        # n x n graph
        D = EuDist2(X, X, squared=True)

        # Pin the diagonal below every other entry so that each sample is
        # column 0 of its own row after sorting, even when X contains
        # duplicate rows; then restore the zero self-distance.
        np.fill_diagonal(D, -1)
        ind_M = np.argsort(D, axis=1)
        np.fill_diagonal(D, 0)

        NN = ind_M[:, 1:(knn + 1)]  # skip column 0 (the sample itself)
        if way == "gaussian":
            Val = _kng_gaussian_weights(matrix_index_take(D, NN), t)
        else:
            # t_free needs the (knn + 1)-th neighbour as the reference distance
            Val = _kng_t_free_weights(
                matrix_index_take(D, ind_M[:, 1:(knn + 2)]), knn)

        A = np.zeros((N, N))
        matrix_index_assign(A, NN, Val)

        if isSym:
            A = (A + A.T) / 2
    else:
        # n x m graph
        num_anchor = Anchor.shape[0]
        D = EuDist2(X, Anchor, squared=True)  # n x m
        ind_M = np.argsort(D, axis=1)

        NN = ind_M[:, :knn]
        if way == "gaussian":
            Val = _kng_gaussian_weights(matrix_index_take(D, NN), t)
        else:
            Val = _kng_t_free_weights(
                matrix_index_take(D, ind_M[:, :(knn + 1)]), knn)

        A = np.zeros((N, num_anchor))
        matrix_index_assign(A, NN, Val)

    return A
def kng_anchor(X, Anchor: np.ndarray, knn=20, way="gaussian", t="mean", HSI=False, shape=None, alpha=0):
    """
    Build the n by m similarity graph between samples and anchors.
    see agci for more detail

    :param X: data matrix of n (a x b in HSI) by d
    :param Anchor: Anchor set, m by d
    :param knn: the number of nearest anchors kept per sample
    :param way: one of ["gaussian", "t_free"]
        "t_free" denote the method proposed in :
            "The constrained laplacian rank algorithm for graph-based clustering"
        "gaussian" denote the heat kernel
    :param t: only needed by gaussian, the bandwidth parameter
    :param HSI: compute similarity for HSI image
    :param shape: list, [a, b, c] image: a x b, c: channel
        (defaults to [1, 1, 1]; only read when HSI is true)
    :param alpha: parameter for HSI, the factor applied to the distances
        computed from the 3 x 3 mean-filtered image before they are added
    :return: A, a matrix (graph) of n by m
    """
    if shape is None:
        shape = [1, 1, 1]

    N = X.shape[0]
    anchor_num = Anchor.shape[0]

    dist = EuDist2(X, Anchor, squared=True)  # n x m

    if HSI:
        # 3 x 3 box filter applied to every channel of the image cube
        box = np.ones((3, 3)) / 9
        cube = X.reshape(shape)
        smoothed = np.zeros_like(cube)
        for band in range(shape[-1]):
            smoothed[:, :, band] = signal.convolve2d(cube[:, :, band], np.rot90(box), mode='same')
        smoothed = smoothed.reshape(shape[0] * shape[1], shape[2])

        # add the alpha-scaled distances of the smoothed pixels to the anchors
        dist += EuDist2(smoothed, Anchor, squared=True) * alpha  # n x m

    order = np.argsort(dist, axis=1)
    nearest = order[:, :knn]   # the knn closest anchors of every sample
    kth = order[:, knn]        # the next closest anchor, passed on as NN_k

    # The weighting itself lives in get_similarity_by_dist (defined elsewhere).
    weights = get_similarity_by_dist(D=dist, NN=nearest, NN_k=kth, knn=knn, way=way, t=t)

    graph = np.zeros((N, anchor_num))
    Ifuns.matrix_index_assign(graph, nearest, weights)
    return graph
def get_anchor(X, m, way="random"):
    """
    Select m anchors for the data matrix X.

    :param X: data matrix of n by d
    :param m: the number of anchors
    :param way: how the anchors are chosen
        "random":     m distinct rows of X sampled without replacement
        "kmeans":     cluster centers of KMeans with init='random'
        "kmeans2":    as "kmeans", then each center is replaced by the row of X
                      closest to it
        "k-means++":  cluster centers of KMeans with init='k-means++'
        "k-means++2": as "k-means++", then each center is replaced by the row
                      of X closest to it
    :return: A, the anchor set, m by d
    :raises ValueError: if ``way`` is not one of the options above
    """
    if way == "random":
        ids = random.sample(range(X.shape[0]), m)
        return X[ids, :]

    kmeans_init = {
        "kmeans": "random",
        "kmeans2": "random",
        "k-means++": "k-means++",
        "k-means++2": "k-means++",
    }
    if way not in kmeans_init:
        raise ValueError("unknown way of choosing anchors: {!r}".format(way))

    A = KMeans(m, init=kmeans_init[way]).fit(X).cluster_centers_

    if way.endswith("2"):
        # Snap every center to its nearest actual sample. Both "...2" variants
        # return rows of X (n by d data), never row indices.
        D = EuDist2(A, X)
        ind = np.argmin(D, axis=1)
        A = X[ind, :]

    return A
def knn_f(X, knn, squared=True):
    """
    Find the knn nearest samples of every row of X (the row itself included).

    :param X: data matrix of n by d
    :param knn: the number of neighbor columns to keep
    :param squared: forwarded to EuDist2 (squared distances when True)
    :return: (NN, NND)
        NN:  n by knn index matrix, the first knn columns of the row-wise
             argsort of the distance matrix
        NND: the distances taken at those indices via matrix_index_take
    """
    dist = EuDist2(X, X, squared=squared)

    # Temporarily push the diagonal to -1 so that (presuming non-negative
    # distances) each sample sorts first in its own row, then put the zero
    # self-distance back before reading the distances out.
    np.fill_diagonal(dist, -1)
    order = np.argsort(dist, axis=1)
    np.fill_diagonal(dist, 0)

    NN = order[:, :knn]
    return NN, matrix_index_take(dist, NN)
def kng(X, knn, way="gaussian", t="mean", self=0, isSym=True):
    """
    Build the n by n k-nearest-neighbour similarity graph of X.

    :param X: data matrix of n by d
    :param knn: the number of nearest neighbors
    :param way: one of ["gaussian", "t_free"]
        "t_free" denote the method proposed in :
            "The constrained laplacian rank algorithm for graph-based clustering"
        "gaussian" denote the heat kernel
    :param t: only needed by gaussian, the bandwidth parameter
    :param self: including self: whether xi is among the knn of xi
        (1: xi is kept as its own first neighbor, anything else: xi is skipped)
    :param isSym: True or False, isSym = True by default;
        when true the graph is returned as (A + A.T) / 2
    :return: A, a matrix (graph) of n by n
    """
    N, _ = X.shape

    # n x n graph
    dist = EuDist2(X, X, squared=True)

    # Sort with the diagonal forced to -1 so every sample is column 0 of its
    # own row, then restore the zero self-distance.
    np.fill_diagonal(dist, -1)
    order = np.argsort(dist, axis=1)
    np.fill_diagonal(dist, 0)

    # Column 0 is xi itself: start there when xi counts as a neighbor,
    # otherwise start one column later.
    first = 0 if self == 1 else 1
    NN = order[:, first:first + knn]
    NN_k = order[:, first + knn]  # the neighbor right after the kept ones

    # Reference kept from the original source (per-row loop form, disabled):
    #   id = NN_full[i, 1 : knn + 2]; di = D[i, id]
    #   A[i, id] = (di[knn] - di) / (knn * di[knn] - np.sum(di[:knn]))
    weights = get_similarity_by_dist(D=dist, NN=NN, NN_k=NN_k, knn=knn, way=way, t=t)

    graph = np.zeros((N, N))
    Ifuns.matrix_index_assign(graph, NN, weights)

    return (graph + graph.T) / 2 if isSym else graph
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as EuDist2
import IDEAL_NPU.funs as Funs
from IDEAL_NPU.cluster import PCN

# Demo script: build a kNN graph of the data returned by Funs.load_Agg(),
# cluster it with PCN and print precision and F1.

knn = 33

X, y_true, N, dim, c_true = Funs.load_Agg()

# Full pairwise squared distances, each row sorted by increasing distance.
D_full = EuDist2(X, X, squared=True)
NN_full = np.argsort(D_full, axis=1)

# Drop column 0 and keep the next knn columns as the neighbor lists.
NN = NN_full[:, 1:(knn + 1)]
NND = Funs.matrix_index_take(D_full, NN)

# Re-order every neighbor list by (distance, index): the distance is the
# primary key and the neighbor index breaks ties.
for i in range(N):
    tmp_ind = np.lexsort((NN[i, :], NND[i, :]))
    NN[i, :] = NN[i, :][tmp_ind]

print("begin")
PCN_obj = PCN(NN, NND)
y_pred = PCN_obj.cluster()
t = PCN_obj.get_time()
print("end", t)

# Scores of the predicted labels against the ground truth.
pre = Funs.precision(y_true=y_true, y_pred=y_pred)
rec = Funs.recall(y_true=y_true, y_pred=y_pred)
f1 = (2 * pre * rec) / (pre + rec)

print("{}".format(pre))
print("{}".format(f1))
if os.path.exists(e2_full_name): continue NN = np.fromfile(graph_full_name, dtype=np.int32) NN = NN.reshape(N, -1) if np.max(NN) >= N: print("Bad graph file, removed") os.system("rm {}".format(graph_full_name)) continue NN[:, 0] = np.array(range(N), dtype=np.int32) knn = NN.shape[1] NND = np.zeros((N, knn)) t1 = time.time() x_norm = np.sum(X**2, axis=1) for i in range(N): NND[i, :] = EuDist2(X[i, :].reshape(1, -1), X[NN[i, :], :], squared=True, X_norm_squared=x_norm[i:(i + 1)].reshape(1, -1), Y_norm_squared=x_norm[NN[i, :]]) # NND[i, :] = my.EuDist2(X[i, :].reshape(1, -1), X[NN[i, :], :], squared=True) t2 = time.time() - t1 print(t2) NN.astype(np.int32).tofile(graph_full_name) NND.astype(np.float64).tofile(e2_full_name)