def dist_mean(data_set_1, data_set_2):
    """Return the mean ``hp.angDist`` distance over all cross-set row pairs.

    Every row ``data_set_1[i, :]`` is paired with every row
    ``data_set_2[j, :]``; the distances are summed and divided by the
    number of pairs.

    Args:
        data_set_1: object exposing ``shape[0]`` and ``[i, :]`` row indexing.
        data_set_2: object exposing ``shape[0]`` and ``[j, :]`` row indexing.

    Returns:
        Sum of the pairwise distances divided by
        ``data_set_1.shape[0] * data_set_2.shape[0]``.

    Note:
        The division is not guarded, so a set with zero rows makes the
        divisor zero.
    """
    rows_1 = data_set_1.shape[0]
    rows_2 = data_set_2.shape[0]
    sum_dis = 0
    for i in range(rows_1):
        for j in range(rows_2):
            # Pass the two rows as separate arguments, matching every other
            # hp.angDist call in this file (the rows used to be wrapped in
            # one tuple and handed over as a single argument).
            sum_dis = sum_dis + hp.angDist(data_set_1[i, :], data_set_2[j, :])
    return sum_dis / (rows_1 * rows_2)
def dist_min(data_set_1, data_set_2):
    """Return the smallest ``hp.angDist`` distance between the two sets.

    Each row of ``data_set_1`` is compared with each row of ``data_set_2``.
    The search starts from 1, so the returned value is never larger than 1
    and is exactly 1 when no pair is closer than that (including when
    either set has no rows).

    NOTE(review): the starting value 1 caps the result; confirm that
    ``hp.angDist`` cannot exceed 1 for the data used here.
    """
    closest = 1
    for a in range(data_set_1.shape[0]):
        row_a = data_set_1[a, :]
        for b in range(data_set_2.shape[0]):
            gap = hp.angDist(row_a, data_set_2[b, :])
            # Strict comparison: ties keep the value found earlier.
            closest = gap if gap < closest else closest
    return closest
def generateCluster(data_set, clus_idx):
    """Label every row of ``data_set`` with one of the indices in ``clus_idx``.

    For each row, the negated ``hp.angDist`` distance to each row named in
    ``clus_idx`` is computed; the row receives the entry of ``clus_idx``
    whose negated distance is largest (first one on ties).

    Args:
        data_set: object exposing ``shape[0]`` and row indexing.
        clus_idx: sequence of row indices into ``data_set``.

    Returns:
        A list with one entry of ``clus_idx`` per row of ``data_set``.
    """
    assignments = []
    for row in range(data_set.shape[0]):
        point = data_set[row, :]
        neg_dists = [-1 * hp.angDist(point, data_set[center]) for center in clus_idx]
        # Position of the first score equal to the maximum.
        best = neg_dists.index(np.max(neg_dists))
        assignments.append(clus_idx[best])
    return assignments
def findRemotestIndex(data_mat, idx_list):
    """Return the row index whose nearest listed row is farthest away.

    For every row of ``data_mat`` the minimum ``hp.angDist`` distance to
    the rows named in ``idx_list`` is taken, and the row with the largest
    such minimum wins (first one on ties). A row that is itself named in
    ``idx_list`` scores ``-inf`` and therefore cannot be selected.

    Returns:
        The winning row index, or 0 when no row scores above 0.
    """
    best_row = 0
    best_gap = 0
    for row in range(data_mat.shape[0]):
        nearest = np.inf
        for chosen in idx_list:
            if row == chosen:
                # Listed rows are disqualified; nothing is below -inf, so
                # later comparisons cannot raise this score again.
                nearest = -np.inf
            else:
                gap = hp.angDist(data_mat[row, :], data_mat[chosen])
                if gap < nearest:
                    nearest = gap
        if nearest > best_gap:
            best_gap, best_row = nearest, row
    return best_row
def angKmeans(data_set, k, create_cent=randCenter, calc_mean=hp.orientationMean,
              max_iterate=50, min_error=0.0):
    """Cluster the rows of ``data_set`` into ``k`` groups with a k-means loop.

    Rows are assigned to the centroid with the smallest ``hp.angDist``
    distance, then each centroid is recomputed with ``calc_mean``. A
    centroid that ends up with no rows is moved onto the row that currently
    has the largest squared distance to its own centroid.

    Args:
        data_set: object exposing ``copy()``, ``shape[0]`` and ``[i, :]``
            indexing; the function works on a copy of it.
        k: number of centroids.
        create_cent: callable ``(data, k)`` returning the initial centroids;
            the result must support ``copy()`` and ``[j, :]`` read/write.
        calc_mean: callable mapping the rows of one cluster to its centroid.
        max_iterate: the loop stops once this many rounds have run.
        min_error: the loop stops when the mean squared distance changes by
            at most this amount between consecutive rounds.

    Returns:
        ``(ini_centroids, centroids, cluster_condition)`` where
        ``cluster_condition`` has one row per data row holding
        ``[assigned centroid index, squared distance to that centroid]``.

    Side effects:
        Prints one progress line per round and a notice whenever a centroid
        had no rows assigned.
    """
    copy_set = data_set.copy()
    m = copy_set.shape[0]
    cluster_condition = np.zeros((m, 2))
    # Begin with an index no centroid can have. With the previous all-zero
    # start, rows landing in centroid 0 on the first pass were not counted as
    # a change, so e.g. k=1 stopped after one round with distances still
    # measured against the initial centroid.
    cluster_condition[:, 0] = -1
    centroids = create_cent(copy_set, k)
    ini_centroids = centroids.copy()

    cluster_changed = True
    iterate_count = 0
    old_error = 0
    while cluster_changed:
        iterate_count = iterate_count + 1
        cluster_changed = False

        # Assignment step: nearest centroid for every row.
        for i in range(m):
            min_dist = np.inf
            min_index = -1
            for j in range(k):
                dist_ji = hp.angDist(centroids[j, :], copy_set[i, :])
                if dist_ji < min_dist:
                    min_dist = dist_ji
                    min_index = j
            if cluster_condition[i, 0] != min_index:
                cluster_changed = True
            cluster_condition[i, :] = min_index, min_dist ** 2

        # Row with the largest squared distance; used to re-seed empty clusters.
        max_dist = -np.inf
        max_index = -1
        for i in range(m):
            if cluster_condition[i, 1] > max_dist:
                max_dist = cluster_condition[i, 1]
                max_index = i

        # Update step: recompute each centroid from its assigned rows.
        for cent in range(k):
            pts_cluster = copy_set[np.nonzero(cluster_condition[:, 0] == cent)]
            if pts_cluster.shape[0] == 0:
                centroids[cent, :] = copy_set[max_index, :]
                print("empty slice happened")
                # A moved centroid invalidates the current assignment.
                cluster_changed = True
            else:
                centroids[cent, :] = calc_mean(pts_cluster)

        # Both stop conditions override any change detected above.
        mean_error = np.sum(cluster_condition[:, 1]) / m
        if iterate_count >= max_iterate:
            cluster_changed = False
        if np.abs(mean_error - old_error) <= min_error:
            cluster_changed = False
        old_error = mean_error
        print("iterate round ", iterate_count, ", average error = ", mean_error)
    return ini_centroids, centroids, cluster_condition
def calcSimilarity(data_set, p_mode=0):
    """Build the pairwise similarity matrix of the rows of ``data_set``.

    The similarity of rows ``i`` and ``j`` is the negated ``hp.angDist``
    distance between them. After the matrix is filled, every diagonal entry
    is overwritten with a single value ``p`` chosen by ``p_mode``.

    Args:
        data_set: object exposing ``shape[0]`` and ``[i, :]`` row indexing.
        p_mode: ``-1`` uses the minimum of the matrix, ``1`` the maximum,
            and any other value (including the default ``0``) the median.
            The statistic is taken over the whole matrix, diagonal included,
            before the diagonal is overwritten.

    Returns:
        An ``n x n`` ``numpy.matrix`` with ``n = data_set.shape[0]``.
    """
    n = data_set.shape[0]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    similarity = np.asmatrix(np.zeros((n, n)))
    for i in range(n):
        for j in range(n):
            similarity[i, j] = -1 * hp.angDist(data_set[i, :], data_set[j, :])

    if p_mode == -1:
        p = np.min(similarity)
    elif p_mode == 1:
        p = np.max(similarity)
    else:
        # Median over the plain ndarray view of the matrix.
        p = np.median(similarity.A)

    for i in range(n):
        similarity[i, i] = p
    return similarity
def neighbour(data_set, data, eps):
    """Return the rows of ``data_set`` that are within ``eps`` of ``data``.

    A row is dropped when ``hp.angDist(row, data) > eps``; all other rows
    are kept in their original order. ``data_set`` itself is not modified.

    Args:
        data_set: object exposing ``copy()``, ``shape[0]`` and ``[i, :]``
            row indexing.
        data: the reference point handed to ``hp.angDist``.
        eps: distance threshold; rows strictly farther than this are removed.

    Returns:
        A copy of ``data_set`` without the rejected rows.
    """
    nb_set = data_set.copy()
    far_rows = [
        i
        for i in range(data_set.shape[0])
        if hp.angDist(data_set[i, :], data) > eps
    ]
    if far_rows:
        # One deletion for all rejected rows instead of re-copying the
        # array once per rejected row.
        nb_set = np.delete(nb_set, far_rows, axis=0)
    return nb_set