def fit_new_trainig(vectors, algo, metric, path_to_save=None, nb_clusters=300, max_no_improvement=1000, verbose=1, shuffle=True):
    """Fit a clustering model (MiniBatchKMeans or KMedoids) on ``vectors``.

    The function name keeps its original spelling ("trainig") so existing
    callers are not broken.

    :param vectors: 2-D array of samples; shuffled IN PLACE when shuffle=True.
    :param algo: 'MiniBatchKMeans' or 'KMedoids'.
    :param metric: distance metric forwarded to KMedoids; ``None`` keeps the
        KMedoids default (ignored for MiniBatchKMeans).
    :param path_to_save: optional .npy path where cluster centers are saved.
    :param nb_clusters: number of clusters to fit.
    :param max_no_improvement: MiniBatchKMeans ``max_no_improvement`` /
        KMedoids ``max_iter``.
    :param verbose: verbosity level for MiniBatchKMeans.
    :param shuffle: shuffle ``vectors`` before fitting.
    :return: the fitted clusterer.
    """
    assert algo in ['MiniBatchKMeans', 'KMedoids']
    if shuffle:
        # NOTE: in-place shuffle — the caller's array is reordered.
        np.random.shuffle(vectors)
    if algo == 'MiniBatchKMeans':
        clusterer = MiniBatchKMeans(n_clusters=nb_clusters, verbose=verbose, max_no_improvement=max_no_improvement)
    else:  # 'KMedoids' — guaranteed by the assert above
        extra = {} if metric is None else {'metric': metric}
        clusterer = KMedoids(n_clusters=nb_clusters, max_iter=max_no_improvement, **extra)
    clusterer.fit(vectors)
    if path_to_save is not None:
        np.save(path_to_save, clusterer.cluster_centers_)
    return clusterer
def test_kmedoids(dtw_value, cluster_num, seed):
    """Cluster a precomputed DTW distance matrix with K-Medoids and dump the
    per-sample labels and the medoid indices to CSV files.

    Fix: removed the dead ``index`` counter that was incremented but never
    used when writing the labels file.

    :param dtw_value: precomputed pairwise DTW distance matrix.
    :param cluster_num: number of clusters (also used in the output filenames).
    :param seed: random_state for reproducible medoid initialization.
    """
    # metric="precomputed": dtw_value is already a pairwise distance matrix.
    km = KMedoids(n_clusters=cluster_num, random_state=seed, metric="precomputed", init='k-medoids++', max_iter=30000)
    dists = dtw_value
    y_pred = km.fit_predict(dists)
    # One predicted cluster label per CSV row.
    with open(r".//res//grid_pred_d" + str(cluster_num) + ".csv", "w", encoding='UTF-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in y_pred:
            writer.writerow([row])
    # One medoid (sample index) per CSV row.
    with open(r".//res//grid_centroids_d" + str(cluster_num) + ".csv", "w", encoding='UTF-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for yi in range(cluster_num):
            writer.writerow([km.medoid_indices_[yi]])
    print('finish')
def build_k_medoids(factors: np.ndarray):
    """Builds a KMedoids model from the given factors.

    When ``CONFIG.use_k`` is set, fits a single model with ``CONFIG.k``
    clusters; otherwise sweeps k = 2..CONFIG.max_clusters, keeps the model
    with the best silhouette score, and writes all scores to
    ``outputs/k_medoids_silhouette.csv``.

    Fixes:
    - the "Better score!" debug line logged the score where k was expected;
    - the scores CSV was indexed 1..n although the fitted cluster counts
      are 2..max_clusters, mislabeling every row.

    :param factors: 2-D feature matrix to cluster.
    :return: the fitted (best) KMedoids model.
    """
    if CONFIG.use_k:
        k = CONFIG.k
        LOG.debug('Running K-Medoids with k=%s clusters...', k)
        model = KMedoids(n_clusters=k, max_iter=500).fit(factors)
    else:
        best_model, best_score = None, float('-inf')
        scores = []
        for i in range(1, CONFIG.max_clusters):
            k = i + 1
            LOG.debug('Starting K-medoids with %s clusters...', k)
            start = time()
            model = KMedoids(n_clusters=k, max_iter=500).fit(factors)
            score = silhouette_score(factors, model.labels_)
            scores.append(score)
            LOG.info(
                'Finished K-medoids with %s clusters in %s seconds, score: %s',
                k, round(time() - start), score)
            if score > best_score:
                best_model = model
                best_score = score
                # BUG FIX: previously logged the score where k was expected.
                LOG.debug('Better score! Saving model with k=%s.', k)
        model = best_model
        # Index by the actual cluster counts (2..max_clusters) so each CSV
        # row is labeled with the k that produced its silhouette score.
        scores = pd.DataFrame(scores, index=np.arange(2, len(scores) + 2))
        scores.to_csv('outputs/k_medoids_silhouette.csv')
    return model
def test_kmedoids_empty_clusters():
    """When a cluster is empty, it should throw a warning."""
    random_state = np.random.RandomState(seed)
    # Three identical points cannot populate two clusters.
    data = [[1], [1], [1]]
    model = KMedoids(n_clusters=2, random_state=random_state)
    with pytest.warns(UserWarning, match="Cluster 1 is empty!"):
        model.fit(data)
def calculate_clusters(answers, real_classes):
    """Fit K-Medoids on the raw and on the scaled answers, then print the
    cluster labels and several agreement metrics against real_classes."""
    kmedoids1 = KMedoids(n_clusters=5, metric=distance_function, random_state=None).fit(answers)
    print(kmedoids1.labels_)
    kmedoids2 = KMedoids(n_clusters=5, metric=distance_function, random_state=None).fit(scale(answers))
    print(kmedoids2.labels_)
    # Print each metric for both label sets (raw first, scaled second).
    label_sets = (kmedoids1.labels_, kmedoids2.labels_)
    for header, metric_fn in (
            ("Rand index:", rand_index),
            ("Adjusted rand index:", adjusted_rand_index),
            ("Sum error:", sum_error),
            ("NMI:", mutual_info_score)):
        print(header)
        for labels in label_sets:
            print(metric_fn(real_classes, labels))
def k_medoids_clustering(self, n_clusters=3, normalized=True, n_repeats=1, criterion='avg_silhouette'):
    """ Method that assigns phase labels to PhaseIdentification object obtained
    by performing K-medoids++ on the specified feeder. A number of repetitions
    can be specified, the best result according to the specified criterion will
    be returned.

    By default the features will be normalized first, by scaling the features
    to have a mean of 0 and unit variance. (More info:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

    :param n_clusters: number of clusters (phases) to fit.
    :param normalized: standardize the voltage features before clustering.
    :param n_repeats: number of K-Medoids restarts; the best run is kept.
    :param criterion: 'avg_silhouette' or 'global_silhouette'.
    :raises ValueError: for an unknown criterion (previously this crashed
        later with a NameError on best_cluster_labels).
    """
    if normalized:
        data = StandardScaler().fit_transform(self.voltage_features)
    else:
        data = self.voltage_features
    # Select the scoring function once; the repeat loop is identical for
    # both criteria (deduplicated from two copy-pasted branches).
    if criterion == 'avg_silhouette':
        score_fn = silhouette_score
    elif criterion == 'global_silhouette':
        score_fn = global_silhouette_criterion
    else:
        raise ValueError("Unknown criterion: %r" % (criterion,))
    best_cluster_labels = np.zeros(np.size(data, 0))
    score = -1
    for _ in range(0, n_repeats):
        i_cluster_labels = KMedoids(n_clusters, init='k-medoids++').fit(data).labels_
        i_score = score_fn(data, i_cluster_labels)
        if i_score > score:
            score = i_score
            best_cluster_labels = i_cluster_labels
    self._algorithm = 'k-medoids++'
    self._n_repeats = n_repeats
    # Phases are 1-based externally.
    self.partial_phase_labels = best_cluster_labels + 1
    self.match_labels()
def test_kmedoids_iris():
    """Test kmedoids on the Iris dataset"""
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    # K-Means reference: average distance to the closest centroid.
    ref_model = KMeans(n_clusters=3).fit(X_iris)
    avg_dist_to_closest_centroid = ref_model.transform(X_iris).min(axis=1).mean()

    distance_metric = "euclidean"
    for init in ["random", "heuristic", "k-medoids++"]:
        model = KMedoids(n_clusters=3, metric=distance_metric, init=init, random_state=rng)
        model.fit(X_iris)

        # Convergence in a reasonable number of steps.
        assert model.n_iter_ < (len(X_iris) // 10)

        distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris)
        avg_dist_to_random_medoid = np.mean(distances.ravel())
        avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0]
        # Distance-to-closest-medoid should be reduced from the average
        # distance by more than 50%.
        assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid
        # With Euclidean distance, K-Medoids should be comparable to K-Means
        # in average distance to cluster centers.
        assert_allclose(avg_dist_to_closest_medoid, avg_dist_to_closest_centroid, rtol=0.1)
def __init__(self, data):
    """Cluster the scalar values of *data* into 3 groups with K-Medoids.

    Attributes:
        states  -- the dict keys, in iteration order.
        kmeans  -- the fitted KMedoids model (name kept for backward
                   compatibility even though it is not KMeans).
        mapping -- cluster ids ordered by ascending center value.
    """
    self.states = data.keys()
    self.kmeans = KMedoids(n_clusters=3)
    values = np.array(tuple(data.values())).reshape(-1, 1)
    self.kmeans.fit(values)
    centers = np.squeeze(self.kmeans.cluster_centers_)
    self.mapping = list(np.argsort(centers))
def test_kmedoid_results(method, init):
    """K-Medoids should separate the two 50-sample clusters in X_cc."""
    expected = np.hstack([np.zeros(50), np.ones(50)])
    model = KMedoids(n_clusters=2, init=init, method=method, random_state=rng)
    model.fit(X_cc)
    # The data are not perfectly separable, so accuracy is ~0.85 rather
    # than 1; labels may also be permuted, hence the symmetric check.
    accuracy = np.mean(model.labels_ == expected)
    assert (accuracy > 0.8) or (1 - accuracy > 0.8)
def test_callable_distance_metric():
    """A user-supplied callable metric should be accepted by KMedoids."""
    random_state = np.random.RandomState(seed)

    def euclidean(a, b):
        # Plain Euclidean distance, spelled out by hand.
        return np.sqrt(np.sum(np.power(a - b, 2)))

    model = KMedoids(random_state=random_state, metric=euclidean)
    predicted = model.fit_predict(X)
    assert len(predicted) == 100
    assert_array_equal(predicted, model.labels_)
def test_kmedoids_on_sparse_input():
    """KMedoids should accept a scipy CSC sparse matrix as input."""
    random_state = np.random.RandomState(seed)
    model = KMedoids(n_clusters=2, random_state=random_state)
    # Two rows, five columns, one nonzero per row.
    rows = np.array([1, 0])
    cols = np.array([0, 4])
    vals = np.array([1, 1])
    sparse_input = csc_matrix((vals, (rows, cols)), shape=(2, 5))
    predicted = model.fit_predict(sparse_input)
    assert len(predicted) == 2
    assert_array_equal(predicted, model.labels_)
def sklearn_kmedoids(ds, numClusters, numSamples):
    """Cluster the first numSamples (x1, x2) points of ds with K-Medoids.

    :return: a DataFrame with a single 'cluster' column of labels.
    """
    model = KMedoids(n_clusters=numClusters, random_state=0)
    sample = ds.df[["x1", "x2"]][:numSamples]
    model.fit(sample[["x1", "x2"]].to_numpy())
    return pd.DataFrame(model.labels_, columns=["cluster"])
def run_KMedoids(n_clusters, pca_components, data, components):
    """Cluster the PCA-transformed data with K-Medoids and return the input
    data joined with its PCA components and a 'Cluster' label column.

    Fix: ``n_clusters`` was accepted but ignored — the cluster count was
    hard-coded to 7. The parameter is now honored.

    :param n_clusters: number of clusters to fit.
    :param pca_components: PCA-transformed feature matrix.
    :param data: original DataFrame to join the components/labels onto.
    :param components: number of trailing columns to rename "Component i".
    """
    clustering = KMedoids(n_clusters=n_clusters, random_state=0)
    clustering.fit(pca_components)
    df_seg_pca_kmedoids = pd.concat(
        [data.reset_index(drop=True), pd.DataFrame(pca_components)], axis=1)
    # Rename the appended PCA columns to "Component 1..components".
    df_seg_pca_kmedoids.columns.values[(-1 * components):] = [
        "Component " + str(i + 1) for i in range(components)
    ]
    df_seg_pca_kmedoids['Cluster'] = clustering.labels_
    return df_seg_pca_kmedoids
def test_heuristic_deterministic():
    """Result of heuristic init method should not depend on random state."""
    # Two different random states must produce identical medoids.
    rng1 = np.random.RandomState(1)
    rng2 = np.random.RandomState(2)
    X = load_iris()["data"]
    D = euclidean_distances(X)
    medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1)
    medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2)
    assert_array_equal(medoids_1, medoids_2)
def test_max_iter():
    """Test that warning message is thrown when max_iter is reached."""
    random_state = np.random.RandomState(seed)
    data = load_iris()["data"]
    # A single iteration with 10 clusters cannot converge on Iris.
    model = KMedoids(n_clusters=10, init="random", random_state=random_state, max_iter=1)
    expected_msg = "Maximum number of iteration reached before"
    with pytest.warns(UserWarning, match=expected_msg):
        model.fit(data)
def test_outlier_robustness():
    """K-Medoids should resist the outlier that drags the K-Means centroid."""
    rng = np.random.RandomState(seed)
    # Construction order kept (kmeans first) so rng consumption is unchanged.
    kmeans = KMeans(n_clusters=2, random_state=rng)
    kmedoids = KMedoids(n_clusters=2, random_state=rng)
    points = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]]
    kmeans.fit(points)
    kmedoids.fit(points)
    # K-Means isolates only the outlier; K-Medoids splits at the real gap.
    assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1])
    assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1])
def test_kmedoid_nclusters(method, init):
    """Even with max_iter=1, all requested medoids must be distinct."""
    n_clusters = 50
    model = KMedoids(n_clusters=n_clusters, init=init, method=method, max_iter=1, random_state=rng)
    model.fit(X_cc)
    distinct_medoids = np.unique(model.medoid_indices_)
    assert len(distinct_medoids) == n_clusters
def test_build():
    """The BUILD init alone (max_iter=0) should reach a good solution."""
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Keep only the first 500 samples so the test stays fast.
    X, y = X[:500], y[:500]
    # Precompute the cosine distance matrix once.
    diss = cosine_distances(X)
    # max_iter=0: no SWAP steps, evaluate the BUILD initialization only.
    ske = KMedoids(20, "precomputed", init="build", max_iter=0)
    ske.fit(diss)
    assert ske.inertia_ <= 230
    assert len(np.unique(ske.labels_)) == 20
def generate_clusteringInfo(filePath):
    """Read per-image feature rows from a CSV, cluster them with K-Medoids
    (PAM) and write label/centre CSVs plus a silhouette score.

    Fixes: removed the redundant ``f.close()`` inside the input with-block,
    and the two output files are now opened with with-blocks so they are
    closed even if a write fails.

    :param filePath: path to the feature CSV; if it contains "cnn" the
        cosine metric and a "cnn" output prefix are used, otherwise a
        chi-squared metric and a "basic" prefix.
    """
    results = []
    numberOfImages = 0
    # Each CSV row is one image's feature vector.
    with open(filePath) as f:
        reader = csv.reader(f)
        for row in reader:
            features = [float(x) for x in row[:]]
            results.append(features)
            numberOfImages += 1
    # Define prefix of the labels and centres file.
    prefix = "basic"
    if "cnn" in filePath:
        prefix = "cnn"
    allImages = np.array(results)
    # Basic features use a chi-squared distance; CNN features use cosine.
    if prefix == "basic":
        Kmedoids = KMedoids(n_clusters=10, metric=chi2_distance, method='pam', random_state=0).fit(allImages)
    else:
        Kmedoids = KMedoids(n_clusters=10, metric='cosine', method='pam', random_state=0).fit(allImages)
    labels = Kmedoids.labels_
    centres = Kmedoids.cluster_centers_
    # Labels: a single comma-separated line.
    with open(prefix + "_labels.csv", "w") as output:
        labelsInfo = [str(l) for l in labels]
        output.write(",".join(labelsInfo))
    # Centres: one comma-separated line per medoid.
    with open(prefix + "_centres.csv", "w") as output:
        for i in range(0, len(centres)):
            centre = []
            for j in range(0, len(centres[i])):
                centre.append(str(centres[i][j]))
            output.write(",".join(centre) + "\n")
    SC = metrics.silhouette_score(allImages, labels)
    print(prefix, "Silhouette Coefficient: ", SC)
def find_optimal_clusters_and_display(pca_components):
    """Find the elbow of the K-Medoids WCSS curve and display it.

    Fits K-Medoids for k = 1..20, locates the knee of the inertia curve
    with KneeLocator, writes the result to the Streamlit page and returns
    the optimal cluster count.
    """
    max_clusters = 21
    candidate_ks = range(1, max_clusters)
    wcss = []
    for k in candidate_ks:
        model = KMedoids(n_clusters=k, random_state=0)
        model.fit(pca_components)
        wcss.append(model.inertia_)
    n_clusters = KneeLocator(list(candidate_ks), wcss, curve='convex', direction='decreasing').knee
    st.write("Optimal number of clusters", n_clusters)
    return n_clusters
def test_kpp_called(_kpp_init_mocked):
    """KMedoids._kpp_init method should be called by _initialize_medoids"""
    distances = np.array([[0, 1], [1, 0]])
    k = 2
    random_state = np.random.RandomState(seed)
    estimator = KMedoids()
    estimator.init = "k-medoids++"
    # _kpp_init is mocked, so _initialize_medoids must forward its
    # arguments and return the mock's sentinel value unchanged.
    result = estimator._initialize_medoids(distances, k, random_state)
    _kpp_init_mocked.assert_called_once_with(distances, k, random_state)
    assert result == _kpp_init_mocked.return_value
def test_update_medoid_idxs_empty_cluster():
    """Label is unchanged for an empty cluster."""
    distances = np.zeros((3, 3))
    labels = np.array([0, 0, 0])
    medoid_idxs = np.array([0, 1])
    model = KMedoids(n_clusters=2)
    # Cluster 1 has no members; silence the resulting empty-cluster warning.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model._update_medoid_idxs_in_place(distances, labels, medoid_idxs)
    # The medoid of the empty cluster must be left untouched.
    assert_array_equal(medoid_idxs, [0, 1])
def test_seuclidean():
    """fit/predict/transform with metric='seuclidean' must emit no warning.

    Fix: ``pytest.warns(None)`` was deprecated in pytest 7 and now raises
    an error. The equivalent check is to escalate every warning to an
    exception, so any warning fails the test.
    """
    data = np.array([0, 0, 0, 1]).reshape((4, 1))
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        km = KMedoids(2, metric="seuclidean", method="pam")
        km.fit(data)
        km.predict(data)
        km.transform(data)
class Clustering:
    """Groups states into three buckets by K-Medoids on their scalar values."""

    def __init__(self, data):
        # Keys and values are consumed in the dict's iteration order.
        self.states = data.keys()
        self.kmeans = KMedoids(n_clusters=3)
        values = np.array(tuple(data.values())).reshape(-1, 1)
        self.kmeans.fit(values)
        # Cluster ids sorted by ascending center value.
        ordered = np.argsort(np.squeeze(self.kmeans.cluster_centers_))
        self.mapping = list(ordered)

    def cluster(self):
        """Return three lists of states, ordered by ascending center value."""
        result = [[], [], []]
        for state, label in zip(self.states, self.kmeans.labels_):
            result[self.mapping.index(label)].append(state)
        return result
def __init__(self, cluster_name='KMedoids', columns=None, eval_inertia=False, eval_silhouette=False, eval_chi=False, eval_dbi=False, eval_sample_size=None, **kwargs):
    """Wrap a KMedoids model together with evaluation settings.

    :param cluster_name: display name for this clusterer.
    :param columns: columns to cluster on (None means all).
    :param eval_inertia/eval_silhouette/eval_chi/eval_dbi: which evaluation
        metrics to compute later.
    :param eval_sample_size: optional sample size used for evaluation.
    :param kwargs: forwarded verbatim to the KMedoids constructor.
    """
    self.cluster_name = cluster_name
    self.columns = columns
    # All remaining keyword arguments configure the underlying model.
    self.model = KMedoids(**kwargs)
    # Evaluation switches.
    self.eval_inertia = eval_inertia
    self.eval_silhouette = eval_silhouette
    self.eval_chi = eval_chi
    self.eval_dbi = eval_dbi
    self.eval_sample_size = eval_sample_size
    # Populated later by fit/evaluate.
    self.transform_cols = None
    self.eval_df = None
    self.centroid_df = None
def find_cluster_centres(text, num_clusters):
    """Cluster the sentences of *text* and return the medoid sentences.

    Sentences are embedded, clustered with K-Medoids (cosine metric), and
    each medoid embedding is mapped back to its source sentence.
    """
    corpus = nltk.sent_tokenize(text)
    corpus_embeddings = embedder.encode(corpus)
    model = KMedoids(n_clusters=num_clusters, random_state=0, metric="cosine")
    model.fit(corpus_embeddings)
    cluster_centers = []
    # Each K-Medoids center IS one of the input embeddings; find which.
    for center_embedding in model.cluster_centers_:
        for index, sentence_embedding in enumerate(corpus_embeddings):
            matches = np.array_equal(sentence_embedding, center_embedding)
            if matches and corpus[index] not in cluster_centers:
                cluster_centers.append(corpus[index])
    return cluster_centers
def EM_build_and_swap(args):
    """Run K-Medoids on a random sample of images and return the medoid
    indices together with the mean best-distance loss.

    :raises Exception: when args.metric is anything other than "L2".
    """
    total_images, total_labels, sigma = load_data(args)
    np.random.seed(args.seed)
    if args.metric != "L2":
        raise Exception("EM does not support metrics other than L2")
    # Sample without replacement from the full image set.
    sample_idx = np.random.choice(range(len(total_images)), size=args.sample_size, replace=False)
    imgs = total_images[sample_idx]
    kmedoids = KMedoids(n_clusters=args.num_medoids, metric='euclidean', random_state=None).fit(imgs)
    medoids = kmedoids.medoid_indices_.tolist()
    best_distances, closest_medoids = get_best_distances(medoids, imgs, metric='L2')
    loss = np.mean(best_distances)
    if args.verbose >= 1:
        print("Final results:")
        print(medoids)
        print(loss)
    return medoids, loss
def _cluster_matrix(self, matrix: pd.DataFrame, n_clusters=5) -> pd.DataFrame:
    '''
    clusters rule matrix (without support,confidence, group and level columns) with Jaccard distance
    :param matrix: rule matrix
    :param n_clusters: number of clusters
    :return: returns clustered matrix (without support,confidence, group and level columns) with rows in one cluster next to each other
    * clustering was not used in the final visual design
    '''
    tmp = matrix.copy()
    # remove redundant info for clustering
    for col in ['support', 'confidence', 'group', 'level']:
        if col in tmp.columns:
            tmp.drop(columns=col, inplace=True)
    # create a binary matrix: cells equal to ' ' become False, all else True
    # NOTE(review): this binarizes the ORIGINAL `matrix`, not `tmp`, so any
    # support/confidence/group/level columns become all-True columns and
    # still influence the Jaccard distances — confirm whether `tmp` was
    # intended here.
    df = np.where(matrix == ' ', False, True)
    # find clusters
    kmedoids_labels = KMedoids(n_clusters=n_clusters, random_state=0, metric='jaccard', init='k-medoids++') \
        .fit_predict(df)
    labels = pd.Series(kmedoids_labels, index=tmp.index)
    # sort rows by clusters (stable: rows of one cluster end up adjacent)
    tmp['labels'] = labels
    tmp.sort_values(by='labels', inplace=True)
    return tmp.drop(columns=['labels'])
def fit(self, x, output_filename_suffix='output.pdf'):
    """Reduce x with PCA, pick the elbow cluster count, and fit KMedoids.

    The number of clusters is chosen by KElbowVisualizer over the PCA-
    transformed data; if no elbow is found, a single cluster is used.
    """
    x = np.array(x)
    num_samples, num_features = x.shape[0], x.shape[1]
    # PCA keeps as many components as the data allows.
    self.__pca = PCA(n_components=min(num_samples, num_features), random_state=0)
    x_transformed = self.__pca.fit_transform(x)
    visualizer = KElbowVisualizer(KMedoids(random_state=0), k=(1, num_samples), timings=False, locate_elbow=True)
    visualizer.fit(x_transformed)
    elbow = visualizer.elbow_value_
    # Fall back to one cluster when the visualizer finds no elbow.
    best_n_clusters = elbow if elbow is not None else 1
    self.__clusterer = KMedoids(n_clusters=best_n_clusters, random_state=0)
    self.__clusterer.fit(x_transformed)
def generate_clustering_pcoa(distance_file, biom_file, metadata_file, num_clusters, output_file=None, plot=False, L=2):
    """Run agglomerative and K-Medoids clustering on a distance matrix and
    render a PCoA figure for each, optionally saving PNG files.

    :param distance_file: a path (str) or an already-loaded matrix (list).
    :param output_file: when not None, figures are saved to disk.
    """
    # Accept either a path or a pre-loaded matrix.
    distance_matrix = distance_file if isinstance(distance_file, list) else CSV.read(distance_file)
    output_matrix = []
    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed', linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(
        n_clusters=num_clusters, metric='precomputed', method='pam', init='heuristic').fit_predict(distance_matrix)
    # PCoA for the agglomerative labels.
    figure = pcoa.PCoA_total_from_matrix_clustering(distance_matrix, biom_file, AgglomerativeCluster, plot=plot)
    if output_file is not None:
        plt.savefig('../src/images/out_L{0}_agglomerative_pcoa.png'.format(L))
    # PCoA for the K-Medoids labels.
    figure = pcoa.PCoA_total_from_matrix_clustering(distance_matrix, biom_file, KMedoidsCluster, plot=plot)
    if output_file is not None:
        plt.savefig('../src/images/out_L{0}_kmedoids_pcoa.png'.format(L))