def templateLengthProcessData(path_to_file, start_centers, expected_cluster_length, ccore, **kwargs):
    """Run k-means over a sample file and validate the allocation result.

    With 'itermax' of 0 the algorithm must not move at all: centers stay as
    given, no clusters are allocated and the total WCE is exactly zero.
    """
    points = read_sample(path_to_file)
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    itermax = kwargs.get('itermax', 200)

    instance = kmeans(points, start_centers, 0.001, ccore, metric=metric, itermax=itermax)
    instance.process()

    clusters = instance.get_clusters()
    centers = instance.get_centers()
    wce = instance.get_total_wce()

    if itermax == 0:
        # Zero iterations: nothing may have changed.
        assertion.eq(start_centers, centers)
        assertion.eq([], clusters)
        assertion.eq(0.0, wce)
        return

    sizes = [len(group) for group in clusters]
    assertion.eq(len(points), sum(sizes))
    assertion.eq(len(clusters), len(centers))

    dimension = len(points[0])
    for center in centers:
        assertion.eq(dimension, len(center))

    if expected_cluster_length is not None:
        sizes.sort()
        expected_cluster_length.sort()
        assertion.eq(sizes, expected_cluster_length)
def cluster_kmeans(self, xs, ys):
    """Cluster (x, y) points with k-means and return the points grouped by cluster.

    :param xs: sequence of x coordinates.
    :param ys: sequence of y coordinates, same length as xs.
    :return: list of clusters, each a list of [x, y] points.
    """
    POI = np.array([[x, y] for x, y in zip(xs, ys)])

    # Pick 14 random points (with replacement) as initial centers.
    # FIX: random.randint is inclusive on both ends, so the upper bound must
    # be len(POI) - 1; the original randint(0, len(POI)) raised IndexError
    # whenever the boundary value was drawn.
    rand = [POI[randint(0, len(POI) - 1)] for _ in range(14)]

    # NOTE(review): the third positional argument of kmeans() is the
    # convergence tolerance, so this runs with tolerance=4 — confirm intended.
    kmeans_instance = kmeans(POI, rand, 4)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    if self.visualize:
        vis = cluster_visualizer()
        vis.append_clusters(clusters, POI)
        vis.show()

    # Translate index-based clusters back to coordinate lists.
    ret = []
    for cluster in clusters:
        ret.append([POI[index] for index in cluster])
    return ret
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Pick the amount of clusters by the elbow method, cluster the sample with
    that amount and visualize both the elbow curve and the final clusters."""
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    analyser = elbow(sample, kmin, kmax, initializer=initializer)
    analyser.process()
    amount_clusters = analyser.get_amount()
    wce = analyser.get_wce()

    seeds = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    model = kmeans(sample, seeds)
    model.process()
    clusters = model.get_clusters()
    final_centers = model.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # Elbow curve with the chosen K highlighted.
    elbow_index = amount_clusters - kmin
    figure = plt.figure(1)
    ax = figure.add_subplot(111)
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    ax.plot(amount_clusters, wce[elbow_index], color='r', marker='.', markersize=10)
    ax.annotate("Elbow", (amount_clusters + 0.1, wce[elbow_index] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, final_centers)
def gt_boxes_cluster(gt_boxes, centers=5):
    """Cluster the sizes (height, width) of ground-truth boxes with k-means.

    :param gt_boxes: numpy array [n, (y1, x1, y2, x2)]
    :param centers: number of cluster centers
    :return: tuple (heights, widths) of the cluster centers, sorted by height
    """
    heights = gt_boxes[:, 2] - gt_boxes[:, 0]
    widths = gt_boxes[:, 3] - gt_boxes[:, 1]
    hw = np.stack([heights, widths], axis=1)
    # Persist the raw height/width pairs for later inspection.
    np.save('/tmp/gt_height_width.npy', hw)

    # K-means with an IoU-based distance between box sizes; centers are seeded
    # from randomly chosen samples without replacement.
    metric = distance_metric(type_metric.USER_DEFINED, func=iou_distance)
    init_centers = hw[np.random.choice(len(hw), centers, replace=False)]
    model = kmeans(hw, init_centers, metric=metric)
    model.process()
    cluster_centers = np.array(model.get_centers())

    height = np.array([round(h, 2) for h in cluster_centers[:, 0]])
    width = np.array([round(w, 2) for w in cluster_centers[:, 1]])

    # Report results ordered by increasing height.
    order = np.argsort(height)
    return height[order], width[order]
def __improve_parameters(self, centers, available_indexes = None):
    """!
    @brief Performs k-means clustering in the specified region.

    @param[in] centers (list): Centers of clusters.
    @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.

    @return (tuple) Allocated clusters and their centers: (clusters, centers);
             each cluster contains indexes of objects in list of data.
             For a single-point region the second element is that point itself.
    """
    # Degenerate region of a single point: it is its own "cluster".
    if available_indexes and len(available_indexes) == 1:
        index_center = available_indexes[0]
        return [ available_indexes ], self.__pointer_data[index_center]

    # Restrict the data to the region when index list is given.
    local_data = self.__pointer_data
    if available_indexes:
        local_data = [ self.__pointer_data[i] for i in available_indexes ]

    # Without explicit centers, split the region in two via k-means++.
    local_centers = centers
    if centers is None:
        local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

    kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
    kmeans_instance.process()

    local_centers = kmeans_instance.get_centers()

    clusters = kmeans_instance.get_clusters()
    # Map region-local point indexes back to indexes in the full data set.
    if available_indexes:
        clusters = self.__local_to_global_clusters(clusters, available_indexes)

    return clusters, local_centers
def kmeansWithScores(filenameData, filenameSilhMean, filenameDBS, filenameCHS, kClusters):
    """Cluster a sample with k-means and persist silhouette/DBS/CHS scores."""
    data = read_sample(str(root) + '\\' + filenameData)
    initial_centers = rci(data, kClusters).initialize()

    model = kmeans(data, initial_centers, metric=metricResearch)
    model.process()
    clusters = model.get_clusters()
    predicted = model.predict(data)

    note = 'k: ' + str(kClusters)

    # Mean silhouette over every point in the sample.
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root, note=note)

    # Davies-Bouldin score.
    dbsScore = dbs(data, predicted)
    witTXT(dbsScore, filenameDBS, filepath=root, note=note)

    # Calinski-Harabasz score.
    chsScore = chs(data, predicted)
    witTXT(chsScore, filenameCHS, filepath=root, note=note)
def __search_optimial_parameters(self, local_data):
    """!
    @brief Splits the region into two clusters, rerunning k-means '__repeat'
            times and keeping the attempt with the smallest total WCE.

    @param[in] local_data (list): Points of a region that should be split into two clusters.

    @return (tuple) Best result found: (clusters, centers, wce).
    """
    best_wce = float('+inf')
    best_centers, best_clusters = None, None

    for _ in range(self.__repeat):
        # k-means++ cannot consider more seed candidates than points exist.
        candidates = min(5, len(local_data))
        seeds = kmeans_plusplus_initializer(local_data, 2, candidates).initialize()

        attempt = kmeans(local_data, seeds, tolerance=self.__tolerance, ccore=False)
        attempt.process()

        attempt_wce = attempt.get_total_wce()
        if attempt_wce < best_wce:
            best_wce = attempt_wce
            best_centers = attempt.get_centers()
            best_clusters = attempt.get_clusters()

    return best_clusters, best_centers, best_wce
def _search_optimal_parameters(self, data, amount):
    """!
    @brief Repeats k-means over 'data' several times and keeps the best run
            according to the total within-cluster error (WCE).

    @param[in] data (array_like): Input data that should be clustered.
    @param[in] amount (uint): Amount of clusters that should be allocated.

    @return (tuple) Optimal clustering result: (clusters, centers, wce).
    """
    best_wce = float('+inf')
    best_clusters, best_centers = [], []

    for _ in range(self.__repeat):
        seeds = kmeans_plusplus_initializer(
            data, amount, random_state=self.__random_state).initialize()
        run = kmeans(data, seeds, tolerance=self.__tolerance, ccore=False).process()

        run_wce = run.get_total_wce()
        if run_wce < best_wce:
            best_wce, best_clusters, best_centers = run_wce, run.get_clusters(), run.get_centers()

        if len(seeds) == 1:
            break  # A single initial center always yields the same result.

    return best_clusters, best_centers, best_wce
def get_kmeans_clusters(data, count_centers):
    """Cluster the rows of 'data' with k-means, wrap each cluster into a colored
    Cluster object, and record the RS (SSB/SST) statistic in RS_RESULT."""
    rows = data.getRows()
    input_data = [row.getDataArray() for row in rows]

    SST = calculate_sst(input_data)

    # K-Means++ seeding followed by standard k-means.
    initial_centers = kmeans_plusplus_initializer(input_data, count_centers).initialize()
    kmeans_instance = kmeans(input_data, initial_centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    colorRange = Constants.DEFAULT_COLOR_SET
    result_clusters = []
    SSW = 0
    for cluster in clusters:
        result_cluster = Cluster(KMeansWindow.get_rows_kmeans(data, cluster))
        ro = KMeansWindow.get_rows_kmeans(data, cluster)
        f = [x._dataArray for x in ro]
        SSW = SSW + calculate_ssw(f)
        # Cluster name doubles as its display color.
        colour = random.choice(colorRange)
        result_cluster.setName(colour)
        result_cluster.setColor(colour)
        result_clusters.append(result_cluster)

    SSB = calculate_ssb(SST, SSW)
    RS_RESULT.append(SSB / SST)
    print(RS_RESULT)
    return result_clusters
def subcluster(dataset):
    """Split 'dataset' into sub-clusters, choosing K via the elbow method.

    For a very narrow K range the elbow analysis is skipped and the midpoint
    of the range is used instead (elbow is unreliable on tiny ranges).
    """
    kmin = 1
    kmax = min(20, len(dataset))

    if kmax - kmin <= 3:
        optimal_clusters = int((kmin + kmax) / 2)
    else:
        analyser = elbow(dataset, kmin, kmax)
        analyser.process()
        optimal_clusters = analyser.get_amount()

    # Never ask for more clusters than there are points.
    optimal_clusters = min(optimal_clusters, len(dataset))

    seeds = kmeans_plusplus_initializer(dataset, optimal_clusters).initialize()
    model = kmeans(dataset, seeds, metric=distance_metric(type_metric.EUCLIDEAN))
    model.process()
    return model.get_clusters()
def templateLengthProcessData(path_to_file, start_centers, expected_cluster_length, ccore, **kwargs):
    """Cluster a sample file with k-means and validate cluster/center sizes.

    :param path_to_file: path of the sample to read.
    :param start_centers: initial cluster centers.
    :param expected_cluster_length: expected cluster sizes (sorted in place),
        or None to skip the size comparison.
    :param ccore: whether to use the C-core implementation.
    """
    sample = read_sample(path_to_file)
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))

    kmeans_instance = kmeans(sample, start_centers, 0.025, ccore, metric=metric)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assertion.eq(len(sample), sum(obtained_cluster_sizes))
    assertion.eq(len(clusters), len(centers))
    for center in centers:
        assertion.eq(len(sample[0]), len(center))

    # FIX: compare against None with 'is not None' — '!=' dispatches to
    # __ne__, which for sequence-like arguments may not be a simple identity
    # test (PEP 8 also mandates identity comparison for None).
    if expected_cluster_length is not None:
        obtained_cluster_sizes.sort()
        expected_cluster_length.sort()
        assertion.eq(obtained_cluster_sizes, expected_cluster_length)
def kmeansWithScores(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Cluster a sample, compute validation scores and run an elbow analysis.

    NOTE: the score-persisting calls are currently disabled in this variant;
    scores are computed but not written anywhere.
    """
    data = read_sample(str(root) + '\\' + nameData)
    initial_centers = rci(data, k_clusters).initialize()

    model = kmeans(data, initial_centers, metric=measure)
    model.process()
    clusters = model.get_clusters()
    predicted = model.predict(data)

    # Per-point silhouette and its mean.
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)

    # Davies-Bouldin and Calinski-Harabasz scores over the predicted labels.
    dbsScore = dbs(data, predicted)
    chsScore = chs(data, predicted)

    # Elbow analysis over the requested K range.
    elbow_instance = elbow(data, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()
def template_clustering(start_centers, path, tolerance=0.25, ccore=False):
    """Cluster a sample with Manhattan-metric k-means, report the runtime and
    visualize the result.

    The multi-dimensional visualizer handles any dimensionality; the dedicated
    kmeans_visualizer (including its animation) only supports data of up to
    three dimensions.
    """
    sample = read_sample(path)
    dimension = len(sample[0])

    metric = distance_metric(type_metric.MANHATTAN)

    observer = kmeans_observer()
    kmeans_instance = kmeans(sample, start_centers, tolerance, ccore, observer=observer, metric=metric)
    (ticks, _) = timedcall(kmeans_instance.process)

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(clusters, sample)
    visualizer.show()

    # FIX: kmeans_visualizer only renders data of up to 3 dimensions; the
    # original 'dimension > 3' condition invoked it exactly when it cannot
    # render and skipped it when it could.
    if dimension <= 3:
        kmeans_visualizer.show_clusters(sample, clusters, centers, start_centers)
        kmeans_visualizer.animate_cluster_allocation(sample, observer)
def calculate_fitness(ind, df, X, target, func_set, operations, number_of_clusters):
    # Scores an evolved distance expression: clusters X with k-means using the
    # expression as the metric, then compares the labelling against 'target'
    # with the V-measure.
    ind_exp = ind.unroll_expression([])

    def fitness_distance(data1, data2):
        """
        input: data1 and data2 = points used in the distance computation
        output: result = distance between the two points
        """
        # NOTE(review): 'eval' here takes five arguments, so it is presumably a
        # project-level expression evaluator shadowing the builtin — confirm.
        result = eval(data1, data2, func_set, operations, ind_exp)
        return result

    # Distance function driven by the evolved expression.
    fitness_metric = distance_metric(type_metric.USER_DEFINED, func=fitness_distance)

    k = number_of_clusters
    initial_centers = kmeans_plusplus_initializer(X, k).initialize()
    kmeans_instance = kmeans(X, initial_centers, metric=fitness_metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    # Write the cluster id of every point into a temporary 'y_pred' column.
    for i in range(len(clusters)):
        df.loc[clusters[i], 'y_pred'] = i

    score = v_measure_score(target, df.y_pred)

    # Drop the temporary column (rebinds the local only; caller's df keeps it).
    df = df.drop(['y_pred'], axis=1)

    return score
def process_kmeans(sample):
    """Benchmark helper: run k-means over 'sample' and return the elapsed ticks.

    Initial centers are spread along the diagonal, one per expected cluster,
    with small random jitter.
    """
    # FIX: the second coordinate used (multiplier + 5) while the first used
    # (multiplier * 5); the offsets are meant to scale identically so that the
    # centers are spaced per cluster index rather than sharing one offset.
    initial_centers = [[random() + (multiplier * 5), random() + (multiplier * 5)]
                       for multiplier in range(NUMBER_CLUSTERS)]
    instance = kmeans(sample, initial_centers)
    (ticks, _) = timedcall(instance.process)
    return ticks
def __improve_parameters(self, centers, available_indexes=None):
    """!
    @brief Performs k-means clustering in the specified region.

    @param[in] centers (list): Cluster centers; when None two centers are
                generated automatically via the k-means++ initializer.
    @param[in] available_indexes (list): Indexes restricting which points take
                part in the clustering; when None every point is used.

    @return (tuple) Allocated clusters, their centers and the total WCE.
    """
    # A region of one point is trivially its own cluster with zero error.
    if available_indexes and len(available_indexes) == 1:
        only_index = available_indexes[0]
        return [available_indexes], self.__pointer_data[only_index], 0.0

    if available_indexes:
        region_points = [self.__pointer_data[i] for i in available_indexes]
    else:
        region_points = self.__pointer_data

    if centers is None:
        seeds = kmeans_plusplus_initializer(region_points, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    else:
        seeds = centers

    solver = kmeans(region_points, seeds, tolerance=self.__tolerance, ccore=False)
    solver.process()

    clusters = solver.get_clusters()
    # Map region-local indexes back into the full data set.
    if available_indexes:
        clusters = self.__local_to_global_clusters(clusters, available_indexes)

    return clusters, solver.get_centers(), solver.get_total_wce()
def templateAnimateClusteringResultNoFailure(filename, initial_centers, ccore_flag):
    """Smoke test: animating the cluster allocation must complete without failure."""
    points = read_sample(filename)
    history = kmeans_observer()
    algorithm = kmeans(points, initial_centers, 0.025, ccore_flag, observer=history)
    algorithm.process()
    kmeans_visualizer.animate_cluster_allocation(points, history)
def template_segmentation_image(source, start_centers):
    """Segment an image by k-means clustering of its pixels and draw the mask."""
    pixels = read_image(source)
    segmenter = kmeans(pixels, start_centers)
    segmenter.process()
    draw_image_mask_segments(source, segmenter.get_clusters())
def clustering_random_points(amount_points, amount_centers, ccore):
    """Benchmark k-means on uniformly random 2D points and print the runtime."""
    sample = [[random.random(), random.random()] for _ in range(amount_points)]
    centers = [[random.random(), random.random()] for _ in range(amount_centers)]

    instance = kmeans(sample, centers, 0.0001, ccore)
    (ticks, _) = timedcall(instance.process)

    print("Execution time ("+ str(amount_points) +" 2D-points):", ticks)
def templateAnimateClusteringResultNoFailure(filename, initial_centers, ccore_flag):
    """Verify that cluster-allocation animation runs end-to-end without errors."""
    data = read_sample(filename)
    recorder = kmeans_observer()
    kmeans(data, initial_centers, 0.025, ccore_flag, observer=recorder).process()
    kmeans_visualizer.animate_cluster_allocation(data, recorder)
def template_segmentation_image(source, start_centers):
    """Run k-means over the pixels of an image and render the segment mask."""
    image_data = read_image(source)
    instance = kmeans(image_data, start_centers)
    instance.process()
    segments = instance.get_clusters()
    draw_image_mask_segments(source, segments)
def testDrawSegmentationResultNoFailure(self):
    """Drawing mask and color segments of a simple image must not raise."""
    image = utils.read_image(IMAGE_SIMPLE_SAMPLES.IMAGE_SIMPLE01)
    palette = [[255, 0, 0], [0, 0, 255], [180, 136, 0], [255, 255, 255]]

    instance = kmeans(image, palette)
    instance.process()
    segments = instance.get_clusters()

    utils.draw_image_mask_segments(IMAGE_SIMPLE_SAMPLES.IMAGE_SIMPLE01, segments)
    utils.draw_image_color_segments(IMAGE_SIMPLE_SAMPLES.IMAGE_SIMPLE01, segments)
def clusterData(data, numberOfConversations, getData):
    """Group messages into conversations with k-means.

    :param getData: truthy -> return cluster membership lists (which messages
        belong to which conversation); falsy -> return the conversation
        centers (times at which each conversation occurred).
    """
    seeds = kmeans_plusplus_initializer(data, numberOfConversations).initialize()
    instance = kmeans(data, seeds)
    instance.process()

    if getData:
        return instance.get_clusters()
    return instance.get_centers()
def templateShowClusteringResultNoFailure(filename, initial_centers, ccore_flag):
    """Smoke test: rendering the k-means clustering result must not raise."""
    points = read_sample(filename)
    algorithm = kmeans(points, initial_centers, 0.025, ccore_flag)
    algorithm.process()
    kmeans_visualizer.show_clusters(points, algorithm.get_clusters(), algorithm.get_centers(), initial_centers)
def kmeansRun(sample, k, specMetric):
    """Cluster 'sample' into k groups using the given metric type.

    :return: tuple (clusters, predicted label for every sample point).
    """
    seeds = rci(sample, k).initialize()
    instance = kmeans(sample, seeds, metric=distance_metric(specMetric))
    instance.process()
    return (instance.get_clusters(), instance.predict(sample))
def template_clustering(start_centers, path, tolerance = 0.25, ccore = True):
    """Cluster a sample file, report the execution time and draw the clusters."""
    points = read_sample(path)
    instance = kmeans(points, start_centers, tolerance, ccore)
    (ticks, _) = timedcall(instance.process)
    allocated = instance.get_clusters()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")
    draw_clusters(points, allocated)
def template_clustering(start_centers, path, tolerance=0.25, ccore=True):
    """Time a k-means run over a sample file and visualize the allocation."""
    data = read_sample(path)
    algorithm = kmeans(data, start_centers, tolerance, ccore)
    (ticks, _) = timedcall(algorithm.process)
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")
    draw_clusters(data, algorithm.get_clusters())
def templateShowClusteringResultNoFailure(filename, initial_centers, ccore_flag):
    """Ensure kmeans_visualizer.show_clusters completes for a clustered sample."""
    data = read_sample(filename)
    instance = kmeans(data, initial_centers, 0.025, ccore_flag)
    instance.process()
    allocated = instance.get_clusters()
    final_centers = instance.get_centers()
    kmeans_visualizer.show_clusters(data, allocated, final_centers, initial_centers)
def templateClusterAllocationOneDimensionData(ccore_flag):
    """Four well-separated 1D groups of 10 points must yield four clusters of 10."""
    offsets = [0.0, 3.0, 5.0, 8.0]
    # 10 jittered points around every offset, concatenated group by group.
    input_data = [[random() + shift] for shift in offsets for _ in range(10)]

    instance = kmeans(input_data, [[shift] for shift in offsets], 0.025, ccore_flag)
    instance.process()
    clusters = instance.get_clusters()

    assertion.eq(4, len(clusters))
    for cluster in clusters:
        assertion.eq(10, len(cluster))
def templateClusterAllocationOneDimensionData(self, ccore_flag):
    """Check that four separated 1D point groups are allocated as four clusters."""
    input_data = []
    for shift in (0, 3, 5, 8):
        input_data += [[random() + shift] for _ in range(10)]

    instance = kmeans(input_data, [[0.0], [3.0], [5.0], [8.0]], 0.025, ccore_flag)
    instance.process()
    clusters = instance.get_clusters()

    assert len(clusters) == 4
    for cluster in clusters:
        assert len(cluster) == 10
def templateClusterAllocationOneDimensionData(ccore_flag):
    """1D data with four separated groups must produce four clusters of size 10."""
    input_data = []
    for offset in (0, 3, 5, 8):
        input_data.extend([random() + offset] for _ in range(10))

    instance = kmeans(input_data, [[0.0], [3.0], [5.0], [8.0]], 0.025, ccore_flag)
    instance.process()
    allocated = instance.get_clusters()

    assertion.eq(4, len(allocated))
    for group in allocated:
        assertion.eq(10, len(group))
def get_modelo(self, algoritmo, eps, neig):
    # Dispatches on the algorithm name, clusters self.amostras and returns the
    # flattened label array.  'eps' and 'neig' only matter for the
    # density-based algorithms (DBSCAN / OPTICS); when too many groups come
    # out, the call retries recursively with a larger neighbour count.
    print(algoritmo + ' ' + str(eps) + ' - ' + str(neig))
    instance = None
    if algoritmo == 'AGNES':
        instance = agglomerative(self.amostras, self.numero_clusters, link=None)
    elif algoritmo == 'BIRCH':
        instance = birch(self.amostras, self.numero_clusters, entry_size_limit=10000)
    elif algoritmo == 'CLARANS':
        instance = clarans(self.amostras, self.numero_clusters, numlocal=100, maxneighbor=1)
    elif algoritmo == 'CURE':
        instance = cure(self.amostras, self.numero_clusters, number_represent_points=5, compression=0.5)
    elif algoritmo == 'DBSCAN':
        instance = dbscan(self.amostras, eps=eps, neighbors=neig)
    elif algoritmo == 'FCM':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = fcm(self.amostras, initial_centers)
    elif algoritmo == 'KMEANS':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = kmeans(self.amostras, initial_centers, tolerance=0.001)
    elif algoritmo == 'KMEDOIDS':
        # NOTE(review): initial medoid indexes are all 0 and their count is
        # hard-coded to 7 — should track self.numero_clusters (original TODO:
        # "adjust the number of clusters").
        instance = kmedoids(self.amostras, initial_index_medoids=[0, 0, 0, 0, 0, 0, 0], tolerance=0.0001)
    elif algoritmo == 'OPTICS':
        instance = optics(self.amostras, eps=eps, minpts=neig)
    elif algoritmo == 'ROCK':
        instance = rock(self.amostras, eps=eps, number_clusters=self.numero_clusters, threshold=0.5)
    else:
        # NOTE(review): an unknown algorithm name leaves 'instance' as None,
        # so the process() call below raises AttributeError.
        pass
    instance.process()
    lista_agrupada = self.get_lista_agrupada(instance.get_clusters())
    lista_agrupada = np.array(lista_agrupada)

    if (neig != 0):
        n_grupos = len(np.unique(lista_agrupada))
        # Too many groups: retry with a wider neighbourhood.
        if n_grupos > self.numero_clusters:
            lista_agrupada = self.get_modelo(algoritmo, eps, neig + 1)
    return lista_agrupada
def _perform_clustering(self):
    """!
    @brief Performs cluster analysis using K-Means with the current centers
            as the initial ones.

    Updates self.__clusters, self.__centers and self.__total_wce in place.
    """
    solver = kmeans(self.__data, self.__centers, tolerance=self.__tolerance, ccore=False).process()
    self.__clusters = solver.get_clusters()
    self.__centers = solver.get_centers()
    self.__total_wce = solver.get_total_wce()
def templateDrawClustersNoFailure(self, data_path, amount_clusters):
    """Smoke test: drawing the clusters of a k-means result must yield axes.

    :param data_path: path of the sample file to cluster.
    :param amount_clusters: amount of clusters to seed via k-means++.
    """
    sample = read_sample(data_path)
    initial_centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()

    # FIX: the third positional argument of kmeans() is the convergence
    # tolerance; the original passed 'amount_clusters' there, silently running
    # with e.g. tolerance=5.  The cluster amount is already encoded in the
    # number of initial centers.
    kmeans_instance = kmeans(sample, initial_centers)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    ax = draw_clusters(sample, clusters)
    assert ax is not None
def model_train_euclidian():
    """Train a 3-cluster k-means model on the module-level data set X."""
    k = 3  # number of clusters
    # Seed the centroids with the K-Means++ strategy.
    initial_centers = kmeans_plusplus_initializer(X, k).initialize()
    # Default metric of the k-means instance is the Euclidean distance.
    model = kmeans(X, initial_centers)
    model.process()
    # Retrieve the allocated clusters.
    clusters = model.get_clusters()
def template_segmentation_image_amount_colors(source, amount):
    """Segment an image into 'amount' colors via k-means over its pixels."""
    pixels = read_image(source)
    seeds = kmeans_plusplus_initializer(
        pixels, amount, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    segmenter = kmeans(pixels, seeds)
    segmenter.process()
    draw_image_mask_segments(source, segmenter.get_clusters())
def elbow_k_means(key_word, model_path):
    """Cluster the 100 words most similar to 'key_word' in 2D PCA space.

    The amount of clusters is chosen with the elbow method.

    :return: tuple (number of clusters, JSON-style list describing the words
        and their similarity score in every cluster).
    """
    logger = Logger(model_path)
    model = logger.model
    result = model.most_similar(key_word, topn=100)

    word_names = [r[0] for r in result]
    word_correlation = [r[1] for r in result]
    word_vectors = [model.wv[name] for name in word_names]

    # Project the embeddings to 2D (PCA; the 'tsne' name is kept from earlier
    # experiments with t-SNE).
    tsne = PCA(n_components=2)
    X_tsne = tsne.fit_transform(word_vectors)

    kmin, kmax = 1, 10
    elbow_instance = elbow(X_tsne, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()

    centers = kmeans_plusplus_initializer(
        X_tsne, amount_clusters,
        amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    k_means_instance = kmeans(X_tsne, centers)
    k_means_instance.process()
    clusters = k_means_instance.get_clusters()

    # Build the serializable cluster description.  (Dead locals of the
    # original — unused 'wce', 'idx', index_to_* lists and the re-fetched
    # centers — have been removed.)
    cluster_list = []
    for cluster in clusters:
        words_list = []
        for index in cluster:
            word_dict = {"text": word_names[index], "correlation": word_correlation[index]}
            words_list.append({"word": word_dict})
        cluster_list.append({"words": words_list})

    return len(clusters), cluster_list
def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length, ccore = False):
    """Cluster a sample and verify the allocated cluster sizes match expectation."""
    points = read_sample(path_to_file)

    instance = kmeans(points, start_centers, 0.025, ccore)
    instance.process()

    sizes = sorted(len(group) for group in instance.get_clusters())
    assert len(points) == sum(sizes)

    expected_cluster_length.sort()
    assert sizes == expected_cluster_length
def templateCollectEvolution(filename, initial_centers, number_clusters, ccore_flag):
    """Check that the observer records at least one evolution step and that
    every recorded step has consistent center dimensions and clusters."""
    points = read_sample(filename)
    recorder = kmeans_observer()
    kmeans(points, initial_centers, 0.025, ccore_flag, observer=recorder).process()

    assertion.le(1, len(recorder))

    dimension = len(points[0])
    for step in range(len(recorder)):
        step_centers = recorder.get_centers(step)
        assertion.le(1, len(step_centers))
        for center in step_centers:
            assertion.eq(dimension, len(center))
        assertion.le(1, len(recorder.get_clusters(step)))
def __process_by_python(self):
    """!
    @brief Performs processing using python implementation.

    Runs k-means for every K in [kmin, kmax), collects each run's total WCE,
    then derives the elbow (optimal K) from the WCE curve.
    """
    for amount in range(self.__kmin, self.__kmax):
        centers = self.__initializer(self.__data, amount).initialize()
        # NOTE(review): ccore=True delegates to the C implementation even
        # though this is the 'python' processing path — confirm intended.
        instance = kmeans(self.__data, centers, ccore=True)
        instance.process()
        self.__wce.append(instance.get_total_wce())

    self.__calculate_elbows()
    self.__find_optimal_kvalue()
def templateEncoderProcedures(filename, initial_centers, number_clusters, ccore_flag):
    """Exercise every cluster-encoding conversion on a k-means result."""
    points = read_sample(filename)
    instance = kmeans(points, initial_centers, 0.025, ccore_flag)
    instance.process()
    clusters = instance.get_clusters()

    encoder = cluster_encoder(instance.get_cluster_encoding(), clusters, points)
    # Cycle through all supported representations; none of them may fail.
    for representation in (type_encoding.CLUSTER_INDEX_LABELING,
                           type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                           type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
        encoder.set_encoding(representation)

    assertion.eq(number_clusters, len(clusters))
def template_clustering(start_centers, path, tolerance=0.25, ccore=True):
    """Cluster a sample, print the runtime and visualize initial/final centers."""
    points = read_sample(path)
    instance = kmeans(points, start_centers, tolerance, ccore)
    (ticks, _) = timedcall(instance.process)

    allocated = instance.get_clusters()
    final_centers = instance.get_centers()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    canvas = cluster_visualizer()
    canvas.append_clusters(allocated, points)
    # Initial and final centers are both rendered as stars.
    canvas.append_cluster(start_centers, marker="*", markersize=20)
    canvas.append_cluster(final_centers, marker="*", markersize=20)
    canvas.show()
def find_optimal_amout_clusters(sample_path, kmin, kmax, algorithm):
    """Choose the cluster amount by silhouette k-search, then cluster and show."""
    points = read_sample(sample_path)

    searcher = silhouette_ksearch(points, kmin, kmax, algorithm=algorithm).process()
    amount = searcher.get_amount()
    scores = searcher.get_scores()
    print("Sample: '%s', Scores: '%s'" % (sample_path, str(scores)))

    seeds = kmeans_plusplus_initializer(points, amount).initialize()
    clusters = kmeans(points, seeds).process().get_clusters()

    canvas = cluster_visualizer()
    canvas.append_clusters(clusters, points)
    canvas.show()
def __initialize_kmeans(self):
    """Seed the EM algorithm: run k-means and derive per-cluster means and
    covariance matrices.  Singleton clusters get a tiny random constant
    matrix to avoid degenerate covariances."""
    seeds = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
    solver = kmeans(self.__sample, seeds, ccore=True)
    solver.process()

    means = solver.get_centers()

    covariances = []
    for allocated in solver.get_clusters():
        if len(allocated) > 1:
            members = [self.__sample[index_point] for index_point in allocated]
            covariances.append(numpy.cov(members, rowvar=False))
        else:
            # Not enough points for a covariance estimate — use small noise.
            dimension = len(self.__sample[0])
            covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)

    return means, covariances
def template_clustering(start_centers, path, tolerance = 0.25, ccore = False):
    """Cluster a sample with Manhattan-metric k-means, report the runtime and
    visualize the result.

    The multi-dimensional visualizer handles any dimensionality; the dedicated
    kmeans_visualizer (including its animation) only supports data of up to
    three dimensions.
    """
    sample = read_sample(path)
    dimension = len(sample[0])

    metric = distance_metric(type_metric.MANHATTAN)

    observer = kmeans_observer()
    kmeans_instance = kmeans(sample, start_centers, tolerance, ccore, observer=observer, metric=metric)
    (ticks, _) = timedcall(kmeans_instance.process)

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(clusters, sample)
    visualizer.show()

    # FIX: kmeans_visualizer only renders data of up to 3 dimensions; the
    # original 'dimension > 3' condition invoked it exactly when it cannot
    # render and skipped it when it could.
    if dimension <= 3:
        kmeans_visualizer.show_clusters(sample, clusters, centers, start_centers)
        kmeans_visualizer.animate_cluster_allocation(sample, observer)
def testDifferentDimensions(self):
    """Mismatched data/center dimensions must make processing fail."""
    instance = kmeans([[0, 1, 5], [0, 2, 3]], [[0, 3]])
    self.assertRaises(NameError, instance.process)
def testCoreInterfaceIntInputData(self):
    """The C-core interface must accept integer input data."""
    instance = kmeans([[1], [2], [3], [20], [21], [22]], [[2], [21]], 0.025, True)
    instance.process()
    assert len(instance.get_clusters()) == 2
def process_kmeans(sample):
    """Benchmark helper: time a k-means run over 'sample' and return the ticks.

    One jittered initial center is generated per expected cluster, spaced
    along the diagonal.
    """
    # FIX: the y-coordinate used (multiplier + 5) while the x-coordinate used
    # (multiplier * 5); the spacing is meant to scale identically in both
    # coordinates so the centers do not all share one constant y-offset.
    centers = [ [random() + (multiplier * 5), random() + (multiplier * 5)] for multiplier in range(NUMBER_CLUSTERS) ]
    instance = kmeans(sample, centers)
    (ticks, _) = timedcall(instance.process)
    return ticks
def testDifferentDimensions(self):
    """The python implementation must raise ValueError on dimension mismatch."""
    instance = kmeans([[0, 1, 5], [0, 2, 3]], [[0, 3]], ccore=False)
    self.assertRaises(ValueError, instance.process)