def __init__(self, publisher_node=None):
    super().__init__()
    self.data_6d = []
    self.data_9d = []
    # Bug fix: the original passed the bare names data_6d / data_9d, which are
    # undefined here; the instance attributes are what is intended.
    self.k_means6 = k_means(self.data_6d)
    self.k_means9 = k_means(self.data_9d)
    self.pub_node = publisher_node
    self.judge_msg = 0
def imgWithKmeans(imgFile, k):
    # Read the image and show the original next to the k-colour version.
    rawData = mtimg.imread(imgFile)
    fig = plt.figure()
    ax1 = fig.add_subplot(121)
    ax1.set_title('original image')
    ax2 = fig.add_subplot(122)
    ax2.set_title('k = {}'.format(k))
    ax1.imshow(rawData)
    # Flatten the (H, W, C) pixel grid into an (H*W, C) array for clustering.
    pixelArray = np.concatenate([i for i in rawData], axis=0)
    imgSize = rawData.shape
    pixelClass, _, _, clusterIndexs = k_means(pixelArray, k, False)
    # Replace every pixel with the centroid of its cluster.
    for pixel, indexs in zip(pixelClass, clusterIndexs):
        pixelArray[indexs] = pixel
    newImg = pixelArray.reshape(imgSize)
    ax2.imshow(newImg)
    plt.show()
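# Hedged usage sketch (not from the original source): how the quantizer above
# might be invoked; 'sample.jpg' is a hypothetical input file.
if __name__ == '__main__':
    imgWithKmeans('sample.jpg', 16)  # original vs. 16-colour version side by side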
def main():
    N = 100
    group_means = [(0, 0), (4, 5), (5, 0)]
    all_points = []
    for group_x_coord, group_y_coord in group_means:
        group_point_x_coords = np.random.randn(N) + group_x_coord
        group_point_y_coords = np.random.randn(N) + group_y_coord
        all_points.extend(zip(group_point_x_coords, group_point_y_coords))
    group_point_lists = k_means.k_means(all_points, 3)
    axes = plt.subplots(2, 1, sharex=True, sharey=True)[1]
    x_coordinates, y_coordinates = zip(*all_points)
    axes[0].plot(x_coordinates, y_coordinates, ".")
    for group_points in group_point_lists:
        if len(group_points):
            x_coordinates, y_coordinates = zip(*group_points)
            axes[1].plot(x_coordinates, y_coordinates, ".", mew=0)
    plt.show()
def biKmeans(dataSet, k, distMeas=dist_eclud):
    """Bisecting k-means: start from a single cluster and repeatedly split the
    cluster whose 2-means split yields the lowest total SSE."""
    m = dataSet.shape[0]
    clusterAssment = np.matrix(np.zeros((m, 2)))
    centroid0 = np.mean(dataSet, axis=0)
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.matrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = k_means(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # Relabel the two halves of the best split: one half takes the next
        # free index, the other keeps the index of the split cluster.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :]
        centList.append(bestNewCents[1, :])
        clusterAssment[np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return centList, clusterAssment
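# Hedged usage sketch (not from the original source) for biKmeans: assumes the
# same module provides dist_eclud and a k_means(dataSet, k, distMeas) returning
# (centroids, clusterAssment) as used above; the data here is made up.
if __name__ == '__main__':
    demo_data = np.matrix(np.random.rand(60, 2))  # 60 random 2-D points
    demo_cents, demo_assign = biKmeans(demo_data, 3)
    print('number of centroids:', len(demo_cents))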
def main():
    file_name = 'transfusion.csv'
    headers, points = read_csv(file_name)
    # points = km.gen_points(4)
    centroids = km.k_means(5, points, point_labels=False)
    print("The headers are {}".format(headers))
    centroids_to_graph(centroids, headers, 0, 1)
def __init__(self, points, k, idx, phi):
    self.points = points
    self.k = k
    self.idx = idx
    self.phi = phi
    self.optCenters, self.optLabels = k_means.k_means(self.points, self.k)
    self.mdp = MeanDist(self.points, self.optCenters, self.optLabels)
    self.pars = [Partition(self.optCenters, self.optLabels)]
def my_cluster_my_kmeans(data, n_cluster=10):
    """Run k-means on the raw data and return the cluster labels."""
    print("Begin my clustering on raw data...")
    print("Data shape = ", data.shape)
    start = time.time()
    labels, _, _, _ = k_means(data, n_clusters=n_cluster, max_iter=300)
    end = time.time()
    print("Clustering on raw data, using time = ", end - start)
    return labels
def main():
    # test = k_means([[250, 250], [1, 1], [1, 2], [2, 1], [10, 10], [240, 240]], 2)
    # print(test)
    image = Image.open("a.jpg")
    data = __decompose(image)
    res = k_means(data, 4)
    __compose(res[0], res[1], image)
    image.save("b.jpg")
    return 0
def elbow_method(data):
    # For each k, run k-means and accumulate the within-cluster sum of
    # squared distances (SSD) to each cluster's center.
    sum_of_squared_distances = []
    for k in range(1, 15):
        centers, clusters = k_means.k_means(data, k)
        ssd = 0
        for i in range(k):
            ssd += np.sum(pairwise_distances(data[clusters == i], [centers[i]]))
        sum_of_squared_distances.append(ssd)
    return sum_of_squared_distances
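# Hedged companion sketch (not from the original source): one plausible way to
# plot the curve returned by elbow_method; the real plot_elbow used elsewhere
# in this collection may differ.
def plot_elbow_sketch(sum_of_squared_distances):
    import matplotlib.pyplot as plt
    ks = range(1, len(sum_of_squared_distances) + 1)
    plt.plot(ks, sum_of_squared_distances, 'bx-')  # look for the "elbow" bend
    plt.xlabel('k')
    plt.ylabel('sum of squared distances')
    plt.title('Elbow method for choosing k')
    plt.show()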
def main():
    # Construct the clusters and plot them.
    samples, c = 800, 5
    data = km.generate_random_blobs(samples, c)
    km.plot_blobs(data)
    centers, clusters = km.k_means(data, c)
    km.plot_clusters(centers, clusters, data)
    # Validate the choice of k with the elbow method and plot the result.
    sum_of_squared_distances = em.elbow_method(data)
    em.plot_elbow(sum_of_squared_distances)
def inertia_plot(n_runs, n_init_range, features, number_of_clusters, labels,
                 label_names, random_seed):
    plt.figure()
    plots = []
    legends = []
    km_util = k_means(random_seed)
    cases = [
        (KMeans, 'k-means++', {}),
        (KMeans, 'random', {}),
        (KMeans, 'custom', {}),
        (MiniBatchKMeans, 'k-means++', {'max_no_improvement': 3}),
        (MiniBatchKMeans, 'random', {'max_no_improvement': 3, 'init_size': 500}),
    ]
    for factory, init, params in cases:
        print("Evaluation of %s with %s init" % (factory.__name__, init))
        inertia = np.empty((len(n_init_range), n_runs))
        for run_id in range(n_runs):
            for i, n_init in enumerate(n_init_range):
                if init == 'custom':
                    # Custom init: build centers from the labelled data, run
                    # single-init k-means n_init times, keep the best inertia.
                    custom_inertias = []
                    for j in range(n_init):
                        centers = km_util.initialize_centers(features, labels, label_names)
                        km = factory(n_clusters=number_of_clusters, init=centers,
                                     random_state=run_id, n_init=1, **params).fit(features)
                        custom_inertias.append(km.inertia_)
                    inertia[i, run_id] = min(custom_inertias)
                else:
                    km = factory(n_clusters=number_of_clusters, init=init,
                                 random_state=run_id, n_init=n_init, **params).fit(features)
                    inertia[i, run_id] = km.inertia_
        p = plt.errorbar(n_init_range, inertia.mean(axis=1), inertia.std(axis=1))
        plots.append(p[0])
        legends.append("%s with %s init" % (factory.__name__, init))
    plt.xlabel('n_init')
    plt.ylabel('inertia')
    plt.legend(plots, legends)
    plt.title("Mean inertia for various k-means init across %d runs" % n_runs)
    plt.show()
def compress_image(file, k):
    img = cv2.imread(file)
    # Flatten the pixel grid into an (N, 3) float32 array for clustering.
    Z = np.float32(img.reshape((-1, 3)))
    _, labels2, centroids2 = k_means(k, Z)
    centroids2 = np.uint8(centroids2)
    # Map every pixel to its centroid colour and restore the image shape.
    res = [centroids2[l] for l in labels2]
    res2 = np.array(res).reshape(img.shape)
    cv2.imwrite('imagenes/perrito_K{0}.jpg'.format(k), res2)
def maintenance(self, node: QTreeNode):
    """
    Maintain the quad tree. See LARS maintenance for details.

    :param node: Node to be updated.
    """
    if node.dirty():
        # bool() is used because a direct comparison would need another None
        # check in the else branch due to the imprecise type hint.
        if bool(node.children):
            if self.check_merge(node, node.children):
                # Merge: pull all elements back up and drop the children.
                node.elements = [
                    element for child in node.children
                    for element in child.elements
                ]
                node.children.clear()
                node.dirty_size = 0
            else:
                clusters, centroids = k_means(node.elements, 4)
                nodes = [
                    QTreeNode(cluster, centroid)
                    for cluster, centroid in zip(clusters, centroids)
                ]
                if self.check_split(node, nodes):
                    node.addChildren(nodes)
                for child in node.children:
                    self.maintenance(child)
                node.dirty_size = 0
        if not bool(node.children):
            clusters, centroids = k_means(node.elements, 4)
            nodes = [
                QTreeNode(cluster, centroid)
                for cluster, centroid in zip(clusters, centroids)
            ]
            if self.check_split(node, nodes):
                node.addChildren(nodes)
            for child in node.children:
                self.maintenance(child)
            node.dirty_size = 0
def compare_to_sklearn():
    n_dim = 2
    n_clusters = 3
    n_samples = 5
    random_state = check_random_state(0)
    X = np.ndarray([n_clusters * n_samples, n_dim])
    initial_centers = random_state.rand(n_clusters, n_dim) * 10
    for i in range(0, n_clusters):
        x_current = random_state.multivariate_normal(
            initial_centers[i, :], [[1, 0], [0, 1]], n_samples)
        X[i * n_samples:(i + 1) * n_samples, :] = x_current
    closest_center, initial_centers = k_means.k_means(X, n_clusters)
    k_means_scikit_learn = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                                  random_state=random_state)
    closest_center_sklearn = k_means_scikit_learn.fit_predict(X)
    _swap_values_in_ndarray(closest_center_sklearn, 2, 0)
    numpy.testing.assert_array_equal(closest_center, closest_center_sklearn)
def fit_transform(self, X, cluster_number, epochs):
    data_number, feature_number = X.shape
    self.__cluster_number = cluster_number
    # Initialize the mixture with a hard k-means assignment.
    model = k_means.k_means()
    y = model.fit_transform(X, cluster_number, 1, distance.euclidean_distance)
    classes = np.unique(y)
    self.__pis = np.zeros((1, cluster_number))
    self.__means = np.zeros((cluster_number, feature_number))
    self.__sigma = np.zeros((cluster_number, feature_number, feature_number))
    for i in range(cluster_number):
        self.__pis[:, i] = np.mean(y == classes[i])
        indexes = np.flatnonzero(y == classes[i])
        self.__means[i] = np.mean(X[indexes], axis=0)
        self.__sigma[i] = np.cov(X[indexes].T)
    for _ in range(epochs):
        # E step: responsibilities; M step: update means, covariances, priors.
        y_probs = self.score(X)
        number_classes = np.sum(y_probs, axis=0, keepdims=True)
        for i in range(cluster_number):
            self.__means[i] = np.sum(y_probs[:, i].reshape((-1, 1)) * X,
                                     axis=0) / number_classes[:, i]
            diff1 = (X - self.__means[i])[:, :, np.newaxis]
            diff2 = np.transpose(diff1, axes=(0, 2, 1)) * y_probs[:, i].reshape(-1, 1, 1)
            self.__sigma[i] = np.tensordot(diff1, diff2, axes=(0, 0)).reshape(
                (feature_number, feature_number)) / number_classes[:, i]
            # Equivalent (slower) loop form of the covariance update:
            # for j in range(data_number):
            #     diff = (X[j] - self.__means[i]).reshape(-1, 1)
            #     self.__sigma[i] += y_probs[j, i] * diff.dot(diff.T)
            # self.__sigma[i] /= number_classes[:, i]
        self.__pis = number_classes / data_number
    return np.argmax(y_probs, axis=1)
def launch_k_means(X):
    clusters = []
    for k in n_clusters:
        print(f"{k} clusters")
        # 'Compress' image using K-means
        _, clustered = k_means(X, k=k, max_iterations=max_iterations,
                               launch_count=launch_count)
        clusters.append(clustered)
        # Save the result for this k to file
        # clustered_compressed = np.array(clustered).astype(np.uint8)
        # np.savetxt(f"{out_data_path}clustered_{k}.txt", X=clustered_compressed,
        #            delimiter='\t', fmt='%1d')
    print("Done clustering.")
    return np.array(clusters)
def to_quad_tree(self, elements: List[Tuple[int, Tuple[float, float]]]):
    """
    Convert given data to a QuadTree.

    :param elements: List of (user id, user location coordinate) tuples.
    """
    break_now = True
    # Initialize the root with the mean coordinate of all elements.
    # Bug fix: the original assigned instead of accumulating, so the
    # representative point was just the last element's coordinate.
    rep_lat = 0
    rep_lon = 0
    for element in elements:
        rep_lat += element[1][0]
        rep_lon += element[1][1]
    rep_lat = rep_lat / max(len(elements), 1)
    rep_lon = rep_lon / max(len(elements), 1)
    self.root = QTreeNode(elements, (rep_lat, rep_lon))
    work_level = [self.root]
    level_no = 1
    while True:
        print("Building level", level_no)
        level_no += 1
        new_level = []
        # Split each node at the current height level.
        level_built_pct = 0
        for node in work_level:
            if len(node.elements) >= 4:
                clusters, centroids = k_means(node.elements, 4)
                nodes = [
                    QTreeNode(cluster, centroid)
                    for cluster, centroid in zip(clusters, centroids)
                ]
                if self.check_split(node, nodes):
                    break_now = False
                    new_level.extend(nodes)
                    node.addChildren(nodes)
            level_built_pct += 100 / len(work_level)
            print("Level built {0}%".format(level_built_pct))
        # No node was split; the algorithm has converged.
        if break_now:
            break
        work_level = new_level
        break_now = True
def __init__(self, center, labels, k_means_flag, terminal, data, k):
    '''
    Constructor.

    center      : preset center vectors
    labels      : preset labels
    k_means_flag: whether to use k_means for initialization
    terminal    : number of iterations
    data        : the data
    k           : target number of clusters
    '''
    self.data = data
    self.k = k
    self.terminal = terminal
    if k_means_flag:
        func = k_means(data, k)
        self.labels, self.center = func.k_means()
    else:
        self.labels = labels
        self.center = center
def test():
    # Three well-separated groups of 3-D points.
    data_set = [
        np.array([1.2, 2.3, 3.14]), np.array([1.3, 3.01, 4.0]),
        np.array([1.4, 3.1, 3.22]), np.array([1.5, 2.3, 3.64]),
        np.array([1.6, 2.33, 4.2]), np.array([1.7, 2.7, 3.76]),
        np.array([1.8, 2.3, 3.72]), np.array([1.88, 3.3, 4.1]),
        np.array([1.9, 3.0, 3.95]),
        np.array([7.2, 8.1, 19.99]), np.array([7, 8.24, 20]),
        np.array([7.54, 7.9, 18.5]), np.array([7.25, 8.1, 19.8]),
        np.array([7, 8.24, 20]), np.array([7.77, 7.6, 18.6]),
        np.array([7.66, 8.2, 19.6]), np.array([7, 8.24, 20]),
        np.array([7.52, 7.8, 18.2]),
        np.array([11.8, 21.7, -34.1]), np.array([12.1, 20.7, -33.6]),
        np.array([10.4, 19.4, -33.8]), np.array([11.9, 22.5, -34.2]),
        np.array([12.1, 20.7, -33.6]), np.array([10.9, 19.8, -32.9]),
        np.array([11.2, 22.0, -34.3]), np.array([12.2, 20.9, -43.0]),
        np.array([10.5, 19.9, -23.5]),
    ]
    C = k_means.k_means(3, dist, *data_set)
    # Output each cluster and its mean vector.
    for cluster in C:
        for vec in cluster:
            print(vec)
        mean_vec = reduce(lambda x, y: x + y, cluster) / len(cluster)
        print("The mean vector is:", mean_vec)
        print()
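# Hedged sketch (not from the original source) of the dist helper passed to
# k_means above; a plain Euclidean distance on NumPy vectors is the natural
# assumption for this data.
def dist_sketch(a, b):
    return np.linalg.norm(a - b)  # Euclidean distance between two vectors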
def main():
    # Read input image
    image = np.array(Image.open(input_image_file))
    X = image.reshape((image.shape[0] * image.shape[1], image.shape[2]))
    for k in n_clusters:
        print(f"{k} clusters")
        # 'Compress' image using K-means
        centroids, clustered = k_means(X, k=k, max_iterations=max_iterations,
                                       launch_count=launch_count)
        new_X = np.array([centroids[cluster_index] for cluster_index in clustered])
        new_X = new_X.astype(np.uint8)
        # Write output image
        new_image = new_X.reshape(image.shape)
        output_image_name = f"{output_image_prefix}_{k}.jpg"
        Image.fromarray(new_image).save(output_image_name)
        print(f"Saved {output_image_name}")
    print("Done.")
def launch_k_means():
    image = np.array(Image.open(image_name))
    X = image.reshape((image.shape[0] * image.shape[1], image.shape[2]))
    for k in n_clusters:
        print(f"{k} clusters")
        # 'Compress' image using K-means
        centroids, clustered = k_means(X, k=k, max_iterations=max_iterations,
                                       launch_count=launch_count)
        # Save the result for this k to file
        np.savetxt(f"{data_path}centroids_{k}.txt", X=centroids, delimiter='\t')
        clustered_compressed = np.array(clustered).astype(np.uint8)
        np.savetxt(f"{data_path}clustered_{k}.txt", X=clustered_compressed,
                   delimiter='\t', fmt='%1d')
    print("Done clustering.")
def divide_list_of_children_by_k_means(k, point_list, list_of_all_children):
    children_coordinates = []
    divide_list_of_children = []
    temp_list_of_children = []
    for i in point_list:
        children_coordinates.append(list(i))
    children_coordinates.remove([0, 0])
    clusters, means = k_means(children_coordinates, k)
    # Map each clustered coordinate back to the child it came from.
    for i in clusters:
        for j in i:
            x1, y1 = j
            index = point_list.index((x1, y1))
            temp_list_of_children.append(list_of_all_children[index])
        divide_list_of_children.append(temp_list_of_children.copy())
        temp_list_of_children.clear()
    return divide_list_of_children, means, clusters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import least_squares as ls  # assumed module name for the least_squares() helper used below
import k_means as km

if __name__ == "__main__":
    save_path = 'toy_data.tsv'
    data = pd.read_csv(save_path, delimiter='\t', header=None)
    labels = data[0].to_numpy()
    labels = np.reshape(labels, (np.size(labels), 1))
    x = data[[1, 2]].to_numpy()
    # x1 = np.reshape(x[:, 0], (np.size(labels), 1))
    # x2 = np.reshape(x[:, 1], (np.size(labels), 1))
    # print(x[:, 0])
    ones = np.ones((len(data), 1))
    A = np.concatenate((ones, x), 1)
    theta = ls.least_squares(A, labels)
    theta_0 = theta[0]
    theta = np.delete(theta, 0, axis=0)
    print(theta, theta_0)
    # plt.scatter(x[0:100, 0], x[0:100, 1])
    # plt.scatter(x[100:, 0], x[100:, 1])
    # axes = plt.gca()
    # x_vals = np.array(axes.get_xlim())
    # y_vals = theta_0 + (-1 * theta[0]) * x_vals
    # plt.plot(x_vals, y_vals, '--')
    # plt.show()
    # z = np.dot(theta, x1[120]) + theta_0
    # print(z)
    theta_numpy, res, rank, s = np.linalg.lstsq(A, labels, rcond=None)
    print(theta_numpy)
    km.k_means(x, labels)
def setUp(self):
    self.centers = k_means(C)
            P.append(ps[j])
    if P:
        hull = ConvexHull(P)
        x = [P[l][0] for l in hull.vertices]
        y = [P[l][1] for l in hull.vertices]
        orig_len = len(x)
        # Pad both ends so the cubic interpolation wraps smoothly around the hull.
        x = x[-3:-1] + x + x[1:3]
        y = y[-3:-1] + y + y[1:3]
        t = np.arange(len(x))
        ti = np.linspace(2, orig_len + 1, 10 * orig_len)
        xi = interp1d(t, x, kind='cubic')(ti)
        yi = interp1d(t, y, kind='cubic')(ti)
        plt.fill(xi, yi, alpha=0.2)


if __name__ == '__main__':
    ps = np.random.rand(300, 2)
    plt.subplot(211)
    plot(ps)
    draw_regs(ps, km.k_means(ps, k), 0)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    plt.subplot(212)
    plot(ps)
    draw_regs(ps, cm.c_means(ps, k, m), 1)
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    plt.show()
def two_points():
    X = np.asarray([(0, 0), (0, 1)])
    clusters, centers = k_means.k_means(X, 2)
    numpy.testing.assert_array_equal(clusters.tolist(), [0, 1])
    numpy.testing.assert_array_equal(centers, X)
percetion_m_childTest = percetion_m_childTest.append(
    bootstrap.Bootstrap(data2, loops).iloc[0])  # .ix was removed from pandas; .iloc is the positional equivalent
percetion_m_childTest = percetion_m_childTest.reset_index(drop=True)
percetion_m_childTest.columns = ['mux', 'muy', 'sigmax', 'sigmay']
# print(percetion_m_childTest)
percetion_m_childTest.to_excel('3.xlsx')
s3 = [np.mean(percetion_m_childTest['mux']),
      np.mean(percetion_m_childTest['muy']),
      np.mean(percetion_m_childTest['sigmax']),
      np.mean(percetion_m_childTest['sigmay'])]
print('All means, averaged directly:')
print(s3)
# 4: k-means clustering
s4 = k_means.k_means(percetion_m_childTest)
s4 = pd.DataFrame(s4)
print('After k_means clustering:')
print(s4)
# 5: KNN
# s5 = KNN.KNN(percetion_m_childTest)
# print('s5, KNN:', s5)
# 6: sum of squared errors
aa = [100, 120, 10, 15]
D = []
for i in range(4):
    d = (aa[i] - s3[i]) * (aa[i] - s3[i])
    D.append(d)
except FileNotFoundError:
    print("ERROR: File", input_filename,
          "does not exist, please input the filename WITHOUT .ppm or check your directory")
    print()
    file_found = False

while True:
    k_num = input("Enter number of colors to filter (whole numbers only):")
    # isnumeric() already rejects '.', so the original's extra check was redundant.
    if k_num.isnumeric():
        break
    print("ERROR: Please enter an integer")

out_filename = input("Enter output image name (w/o .ppm):")
new_image = k_means(image, int(k_num))
save_ppm(out_filename + ".ppm", new_image)
if os.path.isfile(os.getcwd() + "/" + out_filename + ".ppm"):
    print("Image saved in", os.getcwd() + "/" + out_filename, end=".ppm\n")
elif os.path.isfile(out_filename + ".ppm"):
    print("Image saved in", out_filename, end=".ppm\n")
else:
    print("ERROR: Image process error, please restart the program")
# Test set
TESTING_DATA = [
    (1.3, 4, 27, 1, 1, 1),
    (1.1, 6, 30, 10, 0, 0),
    (1.0, 4, 22, 11, 1, 1),
    (0.8, 10, 32, 1, 1, 0),
    (0.8, 10, 32, 6, 1, 1),
    (3.3, 5, 32, 15, 1, 0),
]

# Extract the clusters and assign each one a risk level
print("tuple has the form:")
print("(auto cost, experience, age, number of accidents, sex, married)")
clusters = k_means(TRAINING_DATA, 3)
lvls = {}
for idx in range(len(clusters)):
    cluster = clusters[idx]
    print("%s)" % idx)
    print(cluster)
    lvl = input("Please, input danger level:")
    lvls[idx] = lvl

# Testing and evaluation
for testing_data in TESTING_DATA:
    print("testing data:", testing_data)
    idx = find_nearest_cluster(testing_data, clusters)
    print("Man has %s danger level" % lvls[idx])
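# Hedged sketch (not from the original source) of what find_nearest_cluster
# might look like: it assumes clusters is a list of lists of numeric tuples
# and returns the index of the cluster whose centroid is closest to the point.
def find_nearest_cluster_sketch(point, clusters):
    import math

    def centroid(cluster):
        # Component-wise mean of the cluster's points
        return [sum(dim) / len(cluster) for dim in zip(*cluster)]

    def euclidean(a, b):
        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

    return min(range(len(clusters)),
               key=lambda idx: euclidean(point, centroid(clusters[idx])))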
# (requires Pillow on Python 3, PIL on Python 2)
image = Image.open(imageName)
pixels = image.load()
# Get the image dimensions
width, height = image.size
# Get the image name and extension
name, extension = imageName.split(".")
# Initialize a list of pixels
pixelList = [[(0, 0, 0) for i in range(height)] for j in range(width)]
# Flatten the pixel matrix into the list
for i in range(width):
    for j in range(height):
        pixelList[i][j] = pixels[i, j]
flattenedPixels = [i for sublist in pixelList for i in sublist]
# Run k_means for each K, remap the pixels, then save the image.
# (The flattened pixel list itself is not modified.)
for i in [2, 4, 8, 16, 32, 64, 128]:
    clusters, mapping = k_means(i, flattenedPixels, compression=True)
    for w in range(width):
        for h in range(height):
            pixels[w, h] = clusters[mapping[w * height + h]]
    image.save("ImagenesComprimidas/" + name + "K" + str(i) + "." + extension)
# plt.plot(costs[:, 0], costs[:, 1])
# plt.xlabel("Number of Clusters")
# plt.ylabel("Log Likelihood")
# plt.title("MoG Clusters")
# plt.grid()
# plt.savefig('results/mog_2d_ksweep.pdf')
# plt.close()

''' PART 2.2.4 '''
costs = []
for k in range(1, 10):
    # vcost = train_mog_model(dataset2, k, validation2, 3)[-1]
    clusters = k_means(dataset2, k)[0]
    kvcost = get_cost(validation2, clusters)
    costs.append([k, 0, kvcost])
    # print("%d - %.3f - %.3f" % (k, vcost, kvcost))
# costs = np.array(costs)
# np.savetxt('results/costs100d.csv', costs, fmt="%d, %.3f, %.3f")
#
# plt.plot(costs[:, 0], costs[:, 1])
# plt.xlabel("Number of Clusters")
# plt.ylabel("Log Likelihood")
# plt.title("MoG Clusters")
# plt.grid()
# plt.savefig('results/mog_100d.pdf')
# plt.close()
def four_points_3d():
    # It doesn't make sense to use more dimensions in the ndarray here: it can
    # be reshaped to 2-D anyway.
    X = np.asarray([(0, 1, 1), (0, 1, 2), (0, 1, 10), (0, 1, 11)])
    clusters, centers = k_means.k_means(X, 2)
    numpy.testing.assert_array_equal(clusters.tolist(), [0, 0, 1, 1])
eig_vector = eig_vector.T
req_eigen_vectors = np.zeros((no_of_output_neurons, no_of_hidden_neurons))
# Sort the eigenvectors by their eigenvalues (bubble sort, ascending).
for i in range(0, len(eig_value) - 1):
    for j in range(0, len(eig_value) - i - 1):
        if eig_value[j] > eig_value[j + 1]:
            eig_value[j], eig_value[j + 1] = eig_value[j + 1], eig_value[j]
            # Bug fix: tuple-swapping ndarray rows aliases views and leaves
            # both rows equal; fancy-indexed assignment swaps them safely.
            eig_vector[[j, j + 1]] = eig_vector[[j + 1, j]]
# Take the eigenvectors of the n0 smallest eigenvalues and normalize each one.
for i in range(0, no_of_output_neurons):
    req_eigen_vectors[i] = eig_vector[i]
    req_eigen_vectors[i] = np.divide(
        req_eigen_vectors[i],
        np.linalg.norm(np.dot(hidden_matrix, req_eigen_vectors[i].T)))
hidden_matrix = np.array(hidden_matrix)
req_eigen_vectors = np.array(req_eigen_vectors)
output_matrix = np.dot(hidden_matrix, req_eigen_vectors.T)
print("Final Weights")
print(req_eigen_vectors)
k.k_means(output_matrix, no_of_clusters)
def separate_center():
    X = np.asarray([(0, 1), (0, 2), (0, 3), (0, 10)])
    clusters, centers = k_means.k_means(X, 2, np.asarray([(0, 2), (0, 10)]))
    numpy.testing.assert_array_equal(clusters.tolist(), [0, 0, 0, 1])
    numpy.testing.assert_array_equal(centers.tolist(), [[0, 2], [0, 10]])
    X[i*n_samples:(i+1)*n_samples, :] = x_current
    if not perform_profiling:
        plt.scatter(x_current[:, 0], x_current[:, 1], s=1000,
                    c=plt.cm.viridis(clr), marker=markers[i % len(markers)])

if not perform_profiling:
    print("Points:")
    print(X)

if perform_profiling:
    pr = cProfile.Profile()
    pr.enable()

print('\nCalculating clusters\n')
asked_for_n_clusters = 3
colors = np.linspace(0, 1, asked_for_n_clusters)
closest_center, initial_centers = k_means.k_means(X, asked_for_n_clusters)

if perform_profiling:
    pr.disable()
    s = io.StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    print(s.getvalue())
else:
    print("Closest centers:")
    print(closest_center)
    print("Initial centers:")
    print(initial_centers)
    print("Plotting the calculated clusters")
# Get test data and apply the scaler and PCA
test_ids = np.asarray(fe.get_test_dataset_song_ids())
test_features, test_genres = read_data(fe, test_ids)  # read_echonest_data(fe, test_ids)
test_features = scaler.transform(test_features)
test_features = pca.transform(test_features)

# The number of clusters is the number of genres
number_of_clusters = get_number_of_clusters(fe)

# Get the list of all genres
all_genre_names = fe.get_all_genres()

''' Step 2: K-means Clustering '''
km = k_means(RANDOM_SEED)

# Generate the inertia plot for different initialization strategies
n_run = 20
n_init_range = np.array([1, 5, 10, 15, 20])
inertia_plot(n_run, n_init_range, train_features, number_of_clusters,
             train_genres, all_genre_names, RANDOM_SEED)

# Initialize
n_trials = 100
km_models = []
evaluation_values = []
ls_train_genres = []
ls_validatation_genres = []
ls_train_centers = []
import numpy as np
import string
from word_indexing import word_indexer
from k_means import k_means

data_path = "//home//sh//Desktop//june_project//data_quine//all_texts//1953e_On Mental Entities_Quine (1).txt"
data_points = word_indexer(data_path)
print(data_points)
test1 = k_means(3, data_points)
initialize = test1.cluster_centroid_initialization(2)
print(initialize)
clustering_step = test1.clustering(2, initialize)
print(clustering_step)
runit = test1.find_clusters(2)
print(runit)
cost = test1.distorsion_function(2, runit)
print(cost)
def four_points_2d():
    X = np.asarray([(0, 1), (0, 2), (0, 10), (0, 11)])
    clusters, centers = k_means.k_means(X, 2)
    numpy.testing.assert_array_equal(clusters.tolist(), [0, 0, 1, 1])
def k_medians(points, k, initialization_method):
    if k <= 0 or len(points) <= k:
        return False
    # Initialize k centers with zeroes
    k_centers = np.zeros((k, len(points[0])), dtype=np.float64)
    # Initialization
    if initialization_method == FIRST_K_POINTS:
        print("FIRST_K_POINTS")
        k_centers = points[0:k]
    elif initialization_method == UNIFORMLY_K_POINTS:
        print("UNIFORMLY_K_POINTS")
        # Permute 0..n-1 to pick k random points as the initial centers
        random_array = np.arange(len(points))
        for i in range(random_array.size):
            j = random.randint(0, random_array.size - 1)
            random_array[i], random_array[j] = random_array[j], random_array[i]
        for i in range(len(k_centers)):
            k_centers[i] = points[random_array[i]]
    elif initialization_method == K_MEANS_PLUS_PLUS:
        print("K_MEANS_PLUS_PLUS")
        c0_index = random.randint(0, len(points) - 1)
        k_centers[0] = points[c0_index]
        distribution = np.zeros(len(points), dtype=np.float64)
        for r in range(1, len(k_centers)):
            # Each point is sampled with probability proportional to its
            # squared distance to the nearest already-chosen center.
            for i in range(len(points)):
                nearest_center_index, nearest_distance = find_nearest_point(
                    k_centers[0:r], points[i])
                distribution[i] = nearest_distance ** 2
            # Normalize and accumulate the distribution
            distribution /= np.sum(distribution)
            accumulate_distribution = np.cumsum(distribution)
            random_number = random.random()
            for i in range(len(accumulate_distribution)):
                if random_number <= accumulate_distribution[i] and distribution[i] != 0:
                    k_centers[r] = points[i]
                    break
    elif initialization_method == GONZALES_ALGORITHM:
        print("GONZALES_ALGORITHM")
        # c0_index = random.randint(0, len(points) - 1)
        # k_centers[0] = points[c0_index]
        k_centers[0] = points[0]
        for t in range(1, len(k_centers)):
            # Pick the point farthest from its nearest chosen center
            nearest_center_index, cost_function = find_nearest_point(
                k_centers[0:t], points[0])
            t_th_center_index = 0
            for i in range(1, len(points)):
                nearest_center_index, nearest_distance = find_nearest_point(
                    k_centers[0:t], points[i])
                if nearest_distance > cost_function:
                    t_th_center_index = i
                    cost_function = nearest_distance
            k_centers[t] = points[t_th_center_index]
    elif initialization_method == K_MEANS_PLUS_PLUS_RESULT:
        k_centers, points_labels, k_means_cost_function_values = k_means.k_means(
            points, k, K_MEANS_PLUS_PLUS)
    else:
        return False
    # Clustering: initialize k clusters, i.e., the label array
    points_labels = np.zeros(len(points), dtype=int)
    k_medians_cost_function_values = []
    while True:
        # Assignment
        for i in range(len(points)):
            nearest_center_index, nearest_distance = find_nearest_point(
                k_centers, points[i])
            points_labels[i] = nearest_center_index
        # Compute the k-medians cost function
        k_medians_cost_function_values.append(
            k_medians_cost_function(points, k_centers, points_labels))
        # Update
        new_k_centers = np.zeros((len(k_centers), len(points[0])), dtype=np.float64)
        k_clusters = [[] for i in range(len(new_k_centers))]
        for j in range(len(points_labels)):
            k_clusters[points_labels[j]].append(points[j])
        # Compute the k-medians (instead of the k-means) of each cluster
        # k-means:
        # for i in range(len(new_k_centers)):
        #     new_k_centers[i] = np.mean(np.array(k_clusters[i]), axis=0)
        # k-medians:
        for i in range(len(new_k_centers)):
            new_k_centers[i] = np.median(np.array(k_clusters[i]), axis=0)
        if np.linalg.norm(np.linalg.norm(new_k_centers - k_centers, axis=1)) <= 10.0 ** (-10):
            k_centers = new_k_centers
            k_medians_cost_function_values.append(
                k_medians_cost_function(points, k_centers, points_labels))
            break
        else:
            k_centers = new_k_centers
    return k_centers, points_labels, k_medians_cost_function_values
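# Hedged usage sketch (not from the original source) for k_medians: assumes
# the module-level initialization constants referenced above (for example
# K_MEANS_PLUS_PLUS) are defined alongside the function.
if __name__ == '__main__':
    demo_points = np.random.rand(100, 2)  # 100 random 2-D points
    centers, labels, cost_values = k_medians(demo_points, 3, K_MEANS_PLUS_PLUS)
    print('final k-medians cost:', cost_values[-1])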
def extract_features(pubs, authors):
    m = len(conferences)
    n = len(authors)
    X = np.zeros((m, n))
    for p in pubs:
        aus = pubs[p]['authors']
        venue = pubs[p]['venue']
        for i in range(m):
            if conferences[i] in venue:
                for j in range(n):
                    if authors[j] in aus:
                        X[i][j] = X[i][j] + 1
    # Remove double counting: substring matching means 'KDD' also matches
    # PAKDD/PKDD venues, and 'SDM' also matches WSDM.
    X[kdd_i] = X[kdd_i] - X[pakdd_i] - X[pkdd_i]
    X[sdm_i] = X[sdm_i] - X[wsdm_i]
    return X


def member_print(mem):
    for m in mem:
        for i in range(len(m)):
            if m[i] == 1:
                print(i + 1)


if __name__ == '__main__':
    pubs = load_data(TRAIN_FILE)
    top_aus = top_authors(pubs)
    X = extract_features(pubs, top_aus)
    member = km.k_means(X, 4)
    # member_print(member)
    print("Purity is %f" % ca.purity(ground_truth_label, member))
    print("NMI is %f" % ca.nmi(ground_truth_label, member))
def test_k_means(self):
    result = k_means(self.points, self.k)
    print(result)
def covered(circle: tuple, r: float) -> int:
    # dist appears to return the squared distance, hence the r**2 comparison.
    return sum(dist(circle, point) <= r ** 2 for point in points)


def add(u1: set, u2: set) -> int:
    return len(u2) - len(u2 & u1)


N = int(input())
points = [tuple(map(int, input().split())) for i in range(N)]
M = int(input())
radii = [int(input()) for i in range(M)]
circle_order = sorted(range(M), key=lambda i: radii[i])
centers, ids, groups = k_means(M, points)


def f(i):
    return -sum(dist(point, centers[i]) for point in groups[i])


center_order = sorted(range(M), key=f)
active = set(points)
circles = [None] * M
for i in range(M):
    # print(i)
    t = kdTree(list(active))
    rk, k = circle_order[i], center_order[i]
    candidates = [centers[k]] + groups[k]