def template_compare_output(self, path, k, candidates, random_state, metric):
    """Check k-means++ yields identical indexes for points and distance-matrix input."""
    points = read_sample(path)
    distances = calculate_distance_matrix(points, metric=metric)
    by_points = kmeans_plusplus_initializer(
        points, k, candidates, random_state=random_state,
        data_type='points', metric=metric).initialize(return_index=True)
    by_matrix = kmeans_plusplus_initializer(
        distances, k, candidates, random_state=random_state,
        data_type='distance_matrix', metric=metric).initialize(return_index=True)
    assertion.eq(by_points, by_matrix)
def perform_clustering(embs, method='kmeans'):
    """Perform clustering of embeddings based on the specified method.

    Supported methods: 'kmeans', 'plda_kmeans', 'cosine_kmeans', 'rcc',
    'spectral', 'spectral_cosine'. Returns the predicted cluster labels.
    Raises ValueError for an unknown method.
    """
    if method == 'kmeans':
        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = KMeans(n_clusters=num_speakers, init='k-means++',
                         n_jobs=-1, random_state=0)
        cluster.fit(embs)
        centroids = cluster.cluster_centers_
        # Reassign each embedding to the most cosine-similar centroid instead
        # of using the Euclidean labels produced by fit().
        pred_labels = np.argmax(cosine_similarity(embs, centroids), axis=1)
    elif method == 'plda_kmeans':
        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = IvecKMeans(
            np.array(kmeans_plusplus_initializer(embs, num_speakers).initialize()),
            num_speakers, score_method='plda')
        cluster.fit(embs)
        pred_labels = cluster.old_labels
    elif method == 'cosine_kmeans':
        num_speakers = est_num_clusters(embs, max_num=7, init_num=2)
        cluster = IvecKMeans(
            np.array(kmeans_plusplus_initializer(embs, num_speakers).initialize()),
            num_speakers, score_method='cosine')
        cluster.fit(embs)
        pred_labels = cluster.labels()
    elif method == 'rcc':
        cluster = rcc.RccCluster(k=10, measure='cosine',
                                 clustering_threshold=1, verbose=False)
        pred_labels = cluster.fit(embs)
    elif method == 'spectral':
        cluster = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                                     n_neighbors=10, n_jobs=-1)
        cluster.fit_predict(embs)
        pred_labels = cluster.labels_
    elif method == 'spectral_cosine':
        # Gaussian affinity derived from cosine distance.
        sigma_squared = 0.5
        cosine_dist = 1 - cosine_similarity(embs)
        affinity = np.exp(-np.power(cosine_dist, 2) / sigma_squared)
        if np.isnan(affinity).any() or np.isinf(affinity).any():
            raise ValueError('Affinity matrix contains NaN.')
        norms = np.linalg.norm(embs, axis=1)
        print(np.max(norms), np.min(norms))
        print(np.max(affinity), np.min(affinity))
        cluster = SpectralClustering(n_clusters=2, affinity='precomputed', n_jobs=-1)
        cluster.fit(affinity)
        pred_labels = cluster.labels_
    else:
        raise ValueError('Clustering method not defined.')
    return pred_labels
def get_modelo(self, algoritmo, eps, neig):
    """Build, run and post-process the clustering algorithm named by `algoritmo`.

    `eps` and `neig` are only used by the density-based algorithms
    (DBSCAN/OPTICS/ROCK). Returns a flat numpy array of cluster labels;
    retries with a larger neighborhood while too many groups are produced.
    Raises ValueError for an unknown algorithm name.
    """
    print(algoritmo + ' ' + str(eps) + ' - ' + str(neig))
    instance = None
    if algoritmo == 'AGNES':
        instance = agglomerative(self.amostras, self.numero_clusters, link=None)
    elif algoritmo == 'BIRCH':
        instance = birch(self.amostras, self.numero_clusters, entry_size_limit=10000)
    elif algoritmo == 'CLARANS':
        instance = clarans(self.amostras, self.numero_clusters, numlocal=100, maxneighbor=1)
    elif algoritmo == 'CURE':
        instance = cure(self.amostras, self.numero_clusters, number_represent_points=5, compression=0.5)
    elif algoritmo == 'DBSCAN':
        instance = dbscan(self.amostras, eps=eps, neighbors=neig)
    elif algoritmo == 'FCM':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = fcm(self.amostras, initial_centers)
    elif algoritmo == 'KMEANS':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = kmeans(self.amostras, initial_centers, tolerance=0.001)
    elif algoritmo == 'KMEDOIDS':
        # Initial medoids must be distinct point indexes; the previous
        # hard-coded [0, 0, 0, 0, 0, 0, 0] reused the same point for every
        # cluster. Seed distinct medoids with k-means++ instead.
        initial_medoids = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize(return_index=True)
        instance = kmedoids(self.amostras, initial_index_medoids=initial_medoids,
                            tolerance=0.0001)
    elif algoritmo == 'OPTICS':
        instance = optics(self.amostras, eps=eps, minpts=neig)
    elif algoritmo == 'ROCK':
        instance = rock(self.amostras, eps=eps, number_clusters=self.numero_clusters, threshold=0.5)
    else:
        # Fail fast instead of falling through and crashing on `None.process()`.
        raise ValueError('Unknown clustering algorithm: %s' % algoritmo)
    instance.process()
    lista_agrupada = self.get_lista_agrupada(instance.get_clusters())
    lista_agrupada = np.array(lista_agrupada)
    if (neig != 0):
        n_grupos = len(np.unique(lista_agrupada))
        # Too many groups allocated: retry with a larger neighborhood until
        # the target number of clusters is reached.
        if n_grupos > self.numero_clusters:
            lista_agrupada = self.get_modelo(algoritmo, eps, neig + 1)
    return lista_agrupada
def xmeans_cluster(self, domain_features):
    """Run X-Means five times and keep the run that allocated the most clusters.

    Returns (centers, per-cluster mean radius, clusters) of the best run.
    """
    final_centers = None
    final_radiuses = None
    final_clusters = None
    for attempt in range(5):
        initial_centers = kmeans_plusplus_initializer(domain_features, 2).initialize()
        # X-Means starts its analysis from 2 clusters; cap the amount at half
        # the number of points.
        max_num = int(len(domain_features) / 2)
        xmeans_instance = xmeans(domain_features, initial_centers, max_num)
        xmeans_instance.process()
        centers = xmeans_instance.get_centers()
        # Keep the result with the largest number of clusters found so far.
        if attempt == 0 or len(centers) > len(final_centers):
            radiuses = []
            # NOTE: the original inner loop reused `i`, shadowing the outer
            # loop index; enumerate removes both the hazard and the manual
            # cluster counter.
            for cluster_index, cluster in enumerate(xmeans_instance.get_clusters()):
                radius_total = 0.0
                for point_index in cluster:
                    radius_total += np.linalg.norm(
                        domain_features[point_index] - centers[cluster_index])
                radiuses.append(radius_total / len(cluster))
            final_centers = xmeans_instance.get_centers()
            final_radiuses = radiuses
            final_clusters = xmeans_instance.get_clusters()
    return final_centers, final_radiuses, final_clusters
def _search_optimal_parameters(self, data, amount):
    """!
    @brief Runs k-means clustering several times and keeps the best result by WCE.

    @param[in] data (array_like): Input data that should be clustered.
    @param[in] amount (uint): Amount of clusters that should be allocated.

    @return (tuple) Optimal clustering result: (clusters, centers, wce).

    """
    optimal_wce = float('+inf')
    optimal_clusters, optimal_centers = [], []
    for _ in range(self.__repeat):
        centers = kmeans_plusplus_initializer(
            data, amount, random_state=self.__random_state).initialize()
        instance = kmeans(data, centers, tolerance=self.__tolerance, ccore=False).process()
        wce = instance.get_total_wce()
        if wce < optimal_wce:
            optimal_wce = wce
            optimal_clusters = instance.get_clusters()
            optimal_centers = instance.get_centers()
        # A single initial center always yields the same result - stop early.
        if len(centers) == 1:
            break
    return optimal_clusters, optimal_centers, optimal_wce
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Run elbow analysis on a sample file, plot the WCE curve and show clusters."""
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    elbow_instance = elbow(sample, kmin, kmax, initializer=initializer)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()

    # Cluster with the amount of clusters suggested by the elbow method.
    initial_centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    kmeans_instance = kmeans(sample, initial_centers)
    kmeans_instance.process()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    figure = plt.figure(1)
    ax = figure.add_subplot(111)
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    ax.plot(amount_clusters, wce[amount_clusters - kmin], color='r', marker='.', markersize=10)
    ax.annotate("Elbow", (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, kmeans_instance.get_clusters(),
                                    kmeans_instance.get_centers())
def __init__(self, data, initial_centers=None, kmax=20, tolerance=0.001,
             criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True, **kwargs):
    """!
    @brief Constructor of clustering algorithm X-Means.

    @param[in] data (array_like): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
    @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: `[center1, center2, ...]`,
                if it is not specified then X-Means starts from the random center.
    @param[in] kmax (uint): Maximum number of clusters that can be allocated.
    @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
    @param[in] criterion (splitting_type): Type of splitting creation (by default `splitting_type.BAYESIAN_INFORMATION_CRITERION`).
    @param[in] ccore (bool): Defines if C++ pyclustering library should be used instead of Python implementation.
    @param[in] **kwargs: Arbitrary keyword arguments (available arguments: `repeat`, `random_state`, `metric`, `alpha`, `beta`).

    <b>Keyword Args:</b><br>
        - repeat (unit): How many times K-Means should be run to improve parameters (by default is `1`).
           With larger `repeat` values suggesting higher probability of finding global optimum.
        - random_state (int): Seed for random state (by default is `None`, current system time is used).
        - metric (distance_metric): Metric that is used for distance calculation between two points (by default euclidean square distance).
        - alpha (double): Parameter distributed [0.0, 1.0] for alpha probabilistic bound \f$Q\left(\alpha\right)\f$.
           The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.
        - beta (double): Parameter distributed [0.0, 1.0] for beta probabilistic bound \f$Q\left(\beta\right)\f$.
           The parameter is used only in case of MNDL splitting criterion, in all other cases this value is ignored.

    """
    self.__pointer_data = numpy.array(data)
    self.__clusters = []
    self.__random_state = kwargs.get('random_state', None)
    self.__metric = copy.copy(
        kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE)))

    if initial_centers is not None:
        self.__centers = numpy.array(initial_centers)
    else:
        # No centers were provided - seed two centers using k-means++.
        self.__centers = kmeans_plusplus_initializer(
            data, 2, random_state=self.__random_state).initialize()

    self.__kmax = kmax
    self.__tolerance = tolerance
    self.__criterion = criterion
    self.__total_wce = 0.0
    self.__repeat = kwargs.get('repeat', 1)
    self.__alpha = kwargs.get('alpha', 0.9)
    self.__beta = kwargs.get('beta', 0.9)

    # The C++ core cannot be used with a user-defined Python metric.
    self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
    if self.__ccore is True:
        self.__ccore = ccore_library.workable()

    self.__verify_arguments()
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Find the optimal K with the elbow method, cluster with it and plot both."""
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    analyser = elbow(sample, kmin, kmax, initializer=initializer)
    analyser.process()
    amount_clusters = analyser.get_amount()
    wce = analyser.get_wce()

    seeds = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    clustering = kmeans(sample, seeds)
    clustering.process()
    clusters = clustering.get_clusters()
    centers = clustering.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # Plot the WCE curve and mark the detected elbow point in red.
    figure = plt.figure(1)
    axis = figure.add_subplot(111)
    axis.plot(range(kmin, kmax), wce, color='b', marker='.')
    axis.plot(amount_clusters, wce[amount_clusters - kmin], color='r', marker='.', markersize=10)
    axis.annotate("Elbow", (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    axis.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
def subcluster(dataset):
    """Cluster `dataset` with k-means, choosing K via the elbow method.

    Determining the amount of clusters this way might potentially be
    inefficient; instead of elbow one could repeat the main clustering over
    all K values and pick the one with the lowest error, at a much higher
    time cost.
    """
    kmin = 1
    kmax = min(20, len(dataset))
    # A tiny search range makes the elbow method uninformative - just take
    # the middle of the range.
    if kmax - kmin <= 3:
        optimal_clusters = int((kmin + kmax) / 2)
    else:
        elbow_inst = elbow(dataset, kmin, kmax)
        elbow_inst.process()
        optimal_clusters = elbow_inst.get_amount()
    optimal_clusters = min(optimal_clusters, len(dataset))

    initial_centers = kmeans_plusplus_initializer(dataset, optimal_clusters).initialize()
    kmeans_instance = kmeans(dataset, initial_centers,
                             metric=distance_metric(type_metric.EUCLIDEAN))
    kmeans_instance.process()
    return kmeans_instance.get_clusters()
def templateKmeasPlusPlusCenterInitializer(self, data, amount):
    """Initialize `amount` centers with k-means++ and verify count and dimension."""
    initialized_centers = kmeans_plusplus_initializer(data, amount).initialize()
    self.assertEqual(amount, len(initialized_centers))
    self.assertEqual(len(data[0]), len(initialized_centers[0]))
    return initialized_centers
def __search_optimial_parameters(self, local_data):
    """!
    @brief Splits the region data into two clusters, rerunning k-means
            'repeat' times to approach the global optimum.

    @param[in] local_data (list): Points of a region that should be split into two clusters.

    @return (tuple) List of allocated clusters, list of centers and total WCE (clusters, centers, wce).

    """
    best_wce = float('+inf')
    best_centers, best_clusters = None, None
    for _ in range(self.__repeat):
        # Never request more k-means++ candidates than there are points.
        amount_candidates = min(5, len(local_data))
        seeds = kmeans_plusplus_initializer(local_data, 2, amount_candidates).initialize()
        instance = kmeans(local_data, seeds, tolerance=self.__tolerance, ccore=False)
        instance.process()
        wce = instance.get_total_wce()
        if wce < best_wce:
            best_centers = instance.get_centers()
            best_clusters = instance.get_clusters()
            best_wce = wce
    return best_clusters, best_centers, best_wce
def templateKmeansPlusPlusSeveralRuns(self, path_sample, amount, candidates):
    """Check that three k-means++ runs can produce pairwise-distinct medoids."""
    sample = read_sample(path_sample)
    attempts = 10
    for _ in range(attempts):
        combined = []
        for _ in range(3):
            combined += kmeans_plusplus_initializer(
                sample, amount, candidates).initialize(return_index=True)
        # Success as soon as one attempt yields all-unique medoids.
        if len(set(combined)) == len(combined):
            return
    self.assertTrue(False, "K-Means++ does not return unique medoids during %d attempts." % attempts)
def cluster_xMean_FixSize(binsList, amount_initial_centers=3, kmax=10):
    """Cluster bins by (representation, variance, slope) with X-Means.

    The amount of initial centers defines the number of clusters X-Means
    starts its analysis from. Returns a dict with keys "clusters" and
    "centers".
    """
    # Each bin contributes one 3-dimensional feature point.
    # (The previous version also built unused means/vars/slopes lists,
    # `vars` shadowing the builtin - removed.)
    sample = [[b.get_representation(), b.get_variance(), b.get_slope()]
              for b in binsList]

    initial_centers = kmeans_plusplus_initializer(
        sample, amount_initial_centers).initialize()
    xmeans_instance = xmeans(sample, initial_centers, kmax)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print(len(clusters))
    return {"clusters": clusters, "centers": centers}
def get_kmeans_clusters(data, count_centers):
    """Run k-means over the rows of `data` and wrap results into Cluster objects.

    Side effect: appends SSB/SST of this run to the module-level RS_RESULT
    list and prints it.
    """
    input_data = [row.getDataArray() for row in data.getRows()]
    SST = calculate_sst(input_data)

    # Seed centers with k-means++ and run the clustering.
    initial_centers = kmeans_plusplus_initializer(input_data, count_centers).initialize()
    kmeans_instance = kmeans(input_data, initial_centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    colorRange = Constants.DEFAULT_COLOR_SET
    SSB = 0
    SSW = 0
    result_clusters = []
    for cluster in clusters:
        result_cluster = Cluster(KMeansWindow.get_rows_kmeans(data, cluster))
        cluster_rows = KMeansWindow.get_rows_kmeans(data, cluster)
        features = [x._dataArray for x in cluster_rows]
        SSW = SSW + calculate_ssw(features)
        # Assign a random color/name to the cluster for display.
        colour = random.choice(colorRange)
        result_cluster.setName(colour)
        result_cluster.setColor(colour)
        result_clusters.append(result_cluster)

    SSB = calculate_ssb(SST, SSW)
    RS_RESULT.append(SSB / SST)
    print(RS_RESULT)
    return result_clusters
def __improve_parameters(self, centers, available_indexes=None):
    """!
    @brief Performs k-means clustering in the specified region.

    @param[in] centers (list): Cluster centers, if None then automatically generated two centers using center initialization method.
    @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None then all points are used.

    @return (tuple) List of allocated clusters, list of centers and total WCE.

    """
    # A single-point region cannot be split: it forms its own cluster with
    # zero within-cluster error.
    if available_indexes and len(available_indexes) == 1:
        index_center = available_indexes[0]
        return [available_indexes], self.__pointer_data[index_center], 0.0

    local_data = self.__pointer_data
    if available_indexes:
        local_data = [self.__pointer_data[i] for i in available_indexes]

    local_centers = centers
    if centers is None:
        # Seed two centers with k-means++, considering every point as a
        # candidate for the farthest center.
        local_centers = kmeans_plusplus_initializer(
            local_data, 2,
            kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

    kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
    kmeans_instance.process()

    local_wce = kmeans_instance.get_total_wce()
    local_centers = kmeans_instance.get_centers()
    clusters = kmeans_instance.get_clusters()

    # Clusters are built over local indexes; map them back to indexes in the
    # full data set when a sub-region was clustered.
    if available_indexes:
        clusters = self.__local_to_global_clusters(clusters, available_indexes)

    return clusters, local_centers, local_wce
def calculate_fitness(ind, df, X, target, func_set, operations, number_of_clusters):
    """Evaluate individual `ind` by clustering X with its evolved distance metric.

    The individual's expression is unrolled into a pairwise distance function,
    used as a USER_DEFINED metric for k-means; fitness is the V-measure of the
    resulting labels against `target`.
    """
    ind_exp = ind.unroll_expression([])

    def fitness_distance(data1, data2):
        """Return the distance between the two points `data1` and `data2`.

        (Docstring translated from Portuguese.)
        """
        # NOTE(review): `eval` is called with five arguments, so it must be a
        # project-level evaluator shadowing the builtin - confirm.
        result = eval(data1, data2, func_set, operations, ind_exp)
        return result

    # Wrap the evolved distance as a pyclustering user-defined metric.
    fitness_metric = distance_metric(type_metric.USER_DEFINED, func=fitness_distance)

    k = number_of_clusters
    initial_centers = kmeans_plusplus_initializer(X, k).initialize()
    kmeans_instance = kmeans(X, initial_centers, metric=fitness_metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    # Write predicted labels into the dataframe for scoring.
    for i in range(len(clusters)):
        df.loc[clusters[i], 'y_pred'] = i
    score = v_measure_score(target, df.y_pred)

    # Resetting dataframe. NOTE(review): drop() rebinds only the local name;
    # the caller's DataFrame keeps the added 'y_pred' column - confirm intended.
    df = df.drop(['y_pred'], axis=1)
    return score
def __improve_parameters(self, centers, available_indexes = None):
    """!
    @brief Performs k-means clustering in the specified region.

    @param[in] centers (list): Centers of clusters.
    @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.

    @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.

    """
    # A single-point region cannot be split: it is its own cluster and its
    # center is the point itself.
    if available_indexes and len(available_indexes) == 1:
        index_center = available_indexes[0]
        return [ available_indexes ], self.__pointer_data[index_center]

    local_data = self.__pointer_data
    if available_indexes:
        local_data = [ self.__pointer_data[i] for i in available_indexes ]

    local_centers = centers
    if centers is None:
        # Seed two centers with k-means++ using the farthest-center candidate
        # strategy.
        local_centers = kmeans_plusplus_initializer(
            local_data, 2,
            kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()

    kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False)
    kmeans_instance.process()

    local_centers = kmeans_instance.get_centers()
    clusters = kmeans_instance.get_clusters()

    # Map local cluster indexes back to global data indexes when a sub-region
    # was clustered.
    if available_indexes:
        clusters = self.__local_to_global_clusters(clusters, available_indexes)

    return clusters, local_centers
def templateKmeansPlusPlusUnique(self, path_sample, amount, candidates):
    """Check that k-means++ returns pairwise-distinct medoid indexes."""
    sample = read_sample(path_sample)
    medoid_indexes = kmeans_plusplus_initializer(
        sample, amount, candidates).initialize(return_index=True)
    self.assertEqual(len(set(medoid_indexes)), len(medoid_indexes))
def x_means(X, num_init_clusters=8, visualize=True):
    """Run X-Means on `X` starting from `num_init_clusters` k-means++ centers.

    Returns (centers, clusters). When `visualize` is True, shows the clusters
    with pyclustering's multi-dimensional visualizer.
    """
    # Unused imports (kmeans, kmeans_visualizer, cluster_visualizer) removed.
    from pyclustering.cluster.xmeans import xmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster import cluster_visualizer_multidim

    X = list(X)
    start_centers = kmeans_plusplus_initializer(X, num_init_clusters).initialize()
    # criterion=0 selects the first splitting_type value - presumably the
    # Bayesian Information Criterion; TODO confirm against pyclustering docs.
    xmeans_instance = xmeans(X, start_centers, 32, ccore=True, criterion=0)

    # Run cluster analysis and obtain results.
    xmeans_instance.process()
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print('Number of cluster centers calculated :', len(centers))

    if visualize:
        visualizer = cluster_visualizer_multidim()
        visualizer.append_clusters(clusters, X)
        visualizer.show()
    return centers, clusters
def cl_xmeans(sample):
    """Cluster `sample` with X-Means (2 initial centers, at most 20 clusters)."""
    seeds = kmeans_plusplus_initializer(sample, 2).initialize()
    instance = xmeans(sample, seeds, 20)
    instance.process()
    return instance.get_clusters()


# slc: single linkage clustering
def consensus_clustering(input_path, CLUSTERING_PATH):
    """Build a consensus (co-association) matrix over repeated k-medoids runs.

    Reads a precomputed distance matrix, runs k-medoids `iterations` times for
    each rule-of-thumb value of k, accumulates weighted co-association votes,
    converts the consensus matrix into a distance matrix, saves it to CSV and
    finally performs average-link agglomerative clustering for each k, saving
    the resulting clusters.
    """
    # Read the precomputed distance matrix.
    df = pd.read_csv(CLUSTERING_PATH + "distance_matrix.csv", delimiter=',', header=None)
    # All the values of the distance matrix.
    sample = df.values
    # Number of elements.
    N = len(df.columns)
    print(isinstance(np.asmatrix(sample), np.matrix))

    # Rules of thumb for candidate k values.
    df1 = pd.DataFrame(columns=['value'],
                       index=['k_sqrtNBy4', 'k_sqrtNDiv4', 'k_sqrtNDiv2', 'k_sqrtNBy2', 'k_sqrtN', ])
    #df1.at['k_1','value']= 1
    df1.at['k_sqrtN', 'value'] = round(sqrt(N), 0)
    df1.at['k_sqrtNDiv2', 'value'] = round(sqrt(N / 2), 0)
    df1.at['k_sqrtNBy2', 'value'] = round(sqrt(N * 2), 0)
    df1.at['k_sqrtNDiv4', 'value'] = round(sqrt(N / 4), 0)
    df1.at['k_sqrtNBy4', 'value'] = round(sqrt(N * 4), 0)

    # Declare the weight of each vote; the consensus matrix is NxN.
    iterations = 20
    weight1 = 1 / iterations
    weight2 = 1 / len(df1.index)  # the amount of k values used
    consensus_matrix = np.zeros((N, N))

    # Run the same algorithm using several k values; each configuration is
    # run `iterations` times.
    for k in df1.index:
        for iteration in range(iterations):
            k_value = int(df1.loc[k].values[0])
            initial_medoids = kmeans_plusplus_initializer(sample, k_value).initialize(return_index=True)
            kmedoids_instance = kmedoids(np.asmatrix(sample), initial_medoids, data_type="distance_matrix")
            kmedoids_instance.process()
            clusters = kmedoids_instance.get_clusters()

            # Co-association votes: 1 whenever two elements share a cluster.
            coassociations_matrix = np.zeros((N, N))
            for cluster in clusters:
                for crypto in cluster:
                    # Set the diagonal elements with value 1.
                    coassociations_matrix[crypto][crypto] = 1
                    for crypto1 in cluster:
                        coassociations_matrix[crypto][crypto1] = 1
                        coassociations_matrix[crypto1][crypto] = 1
            # Sum the two matrices.
            consensus_matrix = consensus_matrix + coassociations_matrix

    consensus_matrix = consensus_matrix * weight1 * weight2

    # (1 - consensus_matrix) gives the dissimilarity/distance matrix.
    distance_matrix = 1 - consensus_matrix
    df = pd.DataFrame(data=distance_matrix)
    df.to_csv(CLUSTERING_PATH + "consensus_matrix(distance).csv", sep=",")

    # Hierarchical clustering over the consensus distance matrix.
    for k in df1.index:
        k_value = int(df1.loc[k].values[0])
        agglomerative_instance = agglomerative(distance_matrix, k_value, type_link.AVERAGE_LINK)
        agglomerative_instance.process()
        # Obtain results of clustering.
        clusters = agglomerative_instance.get_clusters()
        save_clusters(input_path, clusters, k, CLUSTERING_PATH)
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Test template: run k-medoids with a given metric and validate cluster sizes.

    kwargs: data_type ('points' or 'distance_matrix'), input_type ('list' or
    'numpy'), initialize_medoids (amount of medoids to seed with k-means++,
    or None to use `initial_medoids` as passed).
    """
    sample = read_sample(path_to_file)
    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
        if input_type == 'numpy':
            input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:
        # in case center initializer randomization appears
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(
                sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.025, ccore_flag, metric=metric, data_type=data_type)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        # Every failed validation retries the randomized run instead of
        # failing immediately.
        if len(clusters) != len(medoids):
            continue
        # Medoids must be unique.
        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        # Each point must be assigned to exactly one cluster.
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_cluster_length is not None:
            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()
            if obtained_cluster_sizes != expected_cluster_length:
                continue

        testing_result = True

    assertion.true(testing_result)
def xmeansRoutine(self):
    """Run X-Means over `self.datalist` and store the results on the instance.

    Populates: initial_centers (k-means++ seeds), xmeans_instance, clusters
    and centers.
    """
    self.initial_centers = kmeans_plusplus_initializer(
        self.datalist, self.amount_initial_centers).initialize()
    self.xmeans_instance = xmeans(self.datalist, self.initial_centers,
                                  self.amount_max_centers)
    self.xmeans_instance.process()
    self.clusters = self.xmeans_instance.get_clusters()
    self.centers = self.xmeans_instance.get_centers()
def templateKmeasPlusPlusCenterInitializer(self, data, amount):
    """Initialize `amount` centers with k-means++ and validate their shape."""
    centers = kmeans_plusplus_initializer(data, amount).initialize()
    assertion.eq(amount, len(centers))
    dimension = len(data[0])
    for center in centers:
        assertion.eq(dimension, len(center))
    return centers
def templateKmeasPlusPlusCenterInitializer(self, data, amount):
    """Check amount and dimensionality of centers produced by k-means++."""
    allocated_centers = kmeans_plusplus_initializer(data, amount).initialize()
    assertion.eq(amount, len(allocated_centers))
    for allocated_center in allocated_centers:
        assertion.eq(len(data[0]), len(allocated_center))
    return allocated_centers
def clusterData(data, numberOfConversations, getData):
    """Cluster messages into conversations with k-means.

    Returns cluster membership when `getData` is truthy, otherwise the
    cluster centers (times when each conversation occurred).
    """
    seeds = kmeans_plusplus_initializer(data, numberOfConversations).initialize()
    instance = kmeans(data, seeds)
    instance.process()
    if getData:
        # List that specifies which messages belong to which conversation.
        return instance.get_clusters()
    # Time list of when each conversation occurred.
    return instance.get_centers()
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Test template: run k-medoids with a metric and itermax, validate results.

    kwargs: data_type, input_type, initialize_medoids, itermax. With
    itermax == 0 the algorithm must do nothing: empty clusters and the
    initial medoids returned unchanged.
    """
    sample = read_sample(path_to_file)
    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
        if input_type == 'numpy':
            input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:
        # in case center initializer randomization appears
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        # Zero iterations: the algorithm must not move from its input state.
        if itermax == 0:
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        # Every failed validation retries the randomized run instead of
        # failing immediately.
        if len(clusters) != len(medoids):
            continue
        # Medoids must be unique.
        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        # Each point must be assigned to exactly one cluster.
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_cluster_length is not None:
            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()
            if obtained_cluster_sizes != expected_cluster_length:
                continue

        testing_result = True

    assertion.true(testing_result)
def templateKmeasPlusPlusCenterInitializerIndexReturn(self, data, amount):
    """Validate that k-means++ returns valid point indexes as centers."""
    center_indexes = kmeans_plusplus_initializer(data, amount).initialize(return_index=True)
    assertion.eq(amount, len(center_indexes))
    for index in center_indexes:
        # Each index must address an existing point.
        assertion.gt(len(data), index)
        assertion.le(0, index)
    return center_indexes
def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
    """Test template: cluster `data_file` with k-medoids and compare the result
    against the expected answer stored in `answer_file`.
    """
    data = read_sample(data_file)
    reader = answer_reader(answer_file)

    amount_medoids = len(reader.get_clusters())

    initial_medoids = kmeans_plusplus_initializer(
        data, amount_medoids, **kwargs).initialize(return_index=True)
    kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore, **kwargs)

    kmedoids_instance.process()

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()

    expected_length_clusters = sorted(reader.get_cluster_lengths())

    # Sanity checks: one medoid per expected cluster and full coverage of the
    # data set.
    assertion.eq(len(expected_length_clusters), len(medoids))
    assertion.eq(len(data), sum([len(cluster) for cluster in clusters]))
    assertion.eq(sum(expected_length_clusters),
                 sum([len(cluster) for cluster in clusters]))

    # Medoids must be unique.
    unique_medoids = set()
    for medoid in medoids:
        assertion.false(
            medoid in unique_medoids,
            message="Medoids '%s' is not unique (actual medoids: '%s')"
                    % (str(medoid), str(unique_medoids)))
        unique_medoids.add(medoid)

    # Each point must belong to exactly one cluster.
    unique_points = set()
    for cluster in clusters:
        for point in cluster:
            assertion.false(
                point in unique_points,
                message="Point '%s' is already assigned to one of the clusters."
                        % str(point))
            unique_points.add(point)

    assertion.eq(expected_length_clusters,
                 sorted([len(cluster) for cluster in clusters]))

    # Every obtained cluster must match one of the expected clusters exactly.
    expected_clusters = reader.get_clusters()
    for actual_cluster in clusters:
        cluster_found = False
        for expected_cluster in expected_clusters:
            if actual_cluster == expected_cluster:
                cluster_found = True

        assertion.true(
            cluster_found,
            message="Actual cluster '%s' is not found among expected."
                    % str(actual_cluster))
def __run_feature_xmeans(self, features, num_init_centers = 10, max_centers = 30, \
    clust_size_threshold = 1, dist_threshold = 10) -> list:
    """Run X-Means on `features` and merge centroids that lie close together.

    Centroids within `dist_threshold` of each other (transitively) are
    averaged into one; clusters of size <= `clust_size_threshold` whose
    centroid was not merged are discarded. Returns the resulting centroids.
    NOTE(review): assumes each feature/centroid is 2-dimensional (row, col) -
    confirm against callers.
    """
    # run xmeans algorithm
    initial_centers = kmeans_plusplus_initializer(features, num_init_centers).initialize()
    algo = xmeans(features, initial_centers=initial_centers, kmax=max_centers)
    algo.process()
    centroids, clusters = algo.get_centers(), algo.get_clusters()

    # pre-process centroids: round coordinates to integer positions
    p_centroids = []
    for coord in centroids:
        row, col = coord[0], coord[1]
        p_centroids.append((int(round(row)), int(round(col))))

    # determine pairs of centroids closer than dist_threshold
    comb_indices = set()
    for comb in itertools.combinations(range(len(p_centroids)), 2):
        cen, c_cen = p_centroids[comb[0]], p_centroids[comb[1]]
        dist = math.sqrt((cen[0] - c_cen[0])**2 + (cen[1] - c_cen[1])**2)
        if dist <= dist_threshold:
            comb_indices.add(frozenset(comb))

    # find transitive centroid clusters: union pairs sharing a member
    trans_centroids = []
    for comb in comb_indices:
        addedFlag = False
        for i in range(len(trans_centroids)):
            if len(trans_centroids[i].intersection(comb)):
                trans_centroids[i] = trans_centroids[i].union(comb)
                addedFlag = True
                break
        if not addedFlag:
            trans_centroids.append(frozenset(comb))

    # combine close transitive centroid sets by averaging their coordinates
    c_centroids, added_indices = [], set()
    for combs in trans_centroids:
        n_centroid = [0, 0]
        for c_idx in combs:
            added_indices.add(c_idx)
            n_centroid[0] += centroids[c_idx][0]
            n_centroid[1] += centroids[c_idx][1]
        n_centroid[0] /= len(combs)
        n_centroid[1] /= len(combs)
        c_centroids.append(n_centroid)

    # keep unmerged centroids whose cluster is large enough; purge the rest
    for c_idx in range(len(centroids)):
        if c_idx in added_indices or len(clusters[c_idx]) \
            <= clust_size_threshold:
            continue
        c_centroids.append(centroids[c_idx])
    return c_centroids
def est_num_clusters(embs, max_num, init_num):
    """Use X-Means to estimate the number of speakers from embeddings."""
    points = embs.tolist()
    seeds = kmeans_plusplus_initializer(points, init_num).initialize()
    instance = xmeans(points, seeds, kmax=max_num, ccore=True)
    instance.process()
    num_speakers = len(instance.get_clusters())
    print('Estimated number of speakers: ' + str(num_speakers))
    return num_speakers
def templateDrawClustersNoFailure(self, data_path, amount_clusters):
    """Smoke test: clustering a sample and drawing the clusters must not fail."""
    sample = read_sample(data_path)

    initial_centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    # NOTE(review): the third positional argument of kmeans() is the
    # tolerance, not the amount of clusters - passing `amount_clusters` here
    # looks unintended but is preserved to keep behavior unchanged. TODO confirm.
    kmeans_instance = kmeans(sample, initial_centers, amount_clusters)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    ax = draw_clusters(sample, clusters)
    assert ax is not None
def templateKmeasPlusPlusCenterInitializerIndexReturn(self, data, amount):
    """Validate that k-means++ returns valid, unique point indexes."""
    indexes = kmeans_plusplus_initializer(data, amount).initialize(return_index=True)
    assertion.eq(amount, len(indexes))
    for index in indexes:
        # Each index must address an existing point and appear exactly once.
        assertion.gt(len(data), index)
        assertion.le(0, index)
        assertion.eq(1, indexes.count(index))
    return indexes
def template_kmeans_plusplus_initializer(path, amount, draw=True):
    """Pick `amount` centers with k-means++ (single candidate) and optionally draw.

    Returns the tuple (sample, centers).
    """
    sample = read_sample(path)
    centers = kmeans_plusplus_initializer(sample, amount, 1).initialize()
    if draw is True:
        figure = cluster_visualizer()
        figure.append_cluster(sample)
        figure.append_cluster(centers, marker='*', markersize=10)
        figure.show()
    return sample, centers
def template_segmentation_image_amount_colors(source, amount):
    """Segment an image into `amount` color clusters and draw the mask."""
    pixels = read_image(source)
    seeds = kmeans_plusplus_initializer(
        pixels, amount,
        kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    instance = kmeans(pixels, seeds)
    instance.process()
    draw_image_mask_segments(source, instance.get_clusters())
def template_kmeans_plusplus_initializer(path, amount, draw=True):
    """Read a sample, pick `amount` centers with k-means++ and optionally draw.

    Returns the tuple (sample, centers). (Legacy semicolons and parenthesized
    conditions modernized to standard Python style.)
    """
    sample = read_sample(path)
    centers = kmeans_plusplus_initializer(sample, amount).initialize()
    if draw is True:
        visualizer = cluster_visualizer()
        visualizer.append_cluster(sample)
        visualizer.append_cluster(centers, marker='*', markersize=10)
        visualizer.show()
    return (sample, centers)
def templateKmeansPlusPlusForKmedoidsClustering(self, path_sample, amount, expected_clusters_length):
    """Retry k-medoids with k-means++ start medoids up to three times.

    The randomized initializer can occasionally produce a poor start, so an
    AssertionError from one attempt is tolerated and the test is retried.
    """
    # BUG FIX: the flag was previously initialized to True, which made the
    # final assertion vacuous even when every attempt failed.
    result_success = False
    for _ in range(3):
        try:
            sample = read_sample(path_sample)
            start_medoids = kmeans_plusplus_initializer(sample, amount).initialize(return_index=True)
            KmedoidsTestTemplates.templateLengthProcessData(
                path_sample, start_medoids, expected_clusters_length, False)
            result_success = True  # the attempt passed - stop retrying
        except AssertionError:
            continue
        break

    assert result_success == True
def find_optimal_amout_clusters(sample_path, kmin, kmax, algorithm):
    """Search K by silhouette score in [kmin, kmax], then cluster and visualize."""
    sample = read_sample(sample_path)
    search_instance = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm).process()
    amount = search_instance.get_amount()
    scores = search_instance.get_scores()
    print("Sample: '%s', Scores: '%s'" % (sample_path, str(scores)))

    seeds = kmeans_plusplus_initializer(sample, amount).initialize()
    kmeans_instance = kmeans(sample, seeds).process()

    visualizer = cluster_visualizer()
    visualizer.append_clusters(kmeans_instance.get_clusters(), sample)
    visualizer.show()
def __initialize_kmeans(self):
    """Run k-means to obtain initial means and per-cluster covariance matrices."""
    seeds = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize()
    instance = kmeans(self.__sample, seeds, ccore=True)
    instance.process()
    means = instance.get_centers()

    covariances = []
    for cluster in instance.get_clusters():
        if len(cluster) > 1:
            cluster_points = [self.__sample[index_point] for index_point in cluster]
            covariances.append(numpy.cov(cluster_points, rowvar=False))
        else:
            # A singleton cluster has no covariance; fill the matrix with a
            # small random constant to keep it non-degenerate.
            dimension = len(self.__sample[0])
            covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0)
    return means, covariances
def cluster_iris():
    """Cluster the Iris sample into 4 groups starting from k-means++ centers."""
    sample = read_sample(FAMOUS_SAMPLES.SAMPLE_IRIS)
    start_centers = kmeans_plusplus_initializer(sample, 4).initialize()
    template_clustering(start_centers, FAMOUS_SAMPLES.SAMPLE_IRIS)