def _cluster_k_means_base(self, corr: pd.DataFrame) -> Dict[int, list]:
    """
    Using KMeans clustering, group the variables of a correlation matrix into
    clusters of highly correlated variables.

    The result is a partition of the original set, that is, a collection of
    mutually disjoint nonempty subsets of variables. For every trial and every
    candidate number of clusters a KMeans model is fitted on the
    correlation-based distance matrix; the model whose per-sample silhouette
    values have the highest mean / standard-deviation ratio is kept.

    Each trial uses its own fixed seed (42 + trial index), so repeated trials
    try different initialisations while the overall result stays reproducible.

    :param corr: correlation matrix as a DataFrame; its column labels identify the variables
    :return: The optimal partition, mapping each cluster label to the list of
        column labels assigned to that cluster
    :raises ValueError: if no clustering could be evaluated (no trials requested,
        or fewer than two clusters allowed for the given matrix size)
    """
    # Turn correlations into distances: sqrt((1 - rho) / 2); missing correlations count as 0.
    distance_matrix = ((1 - corr.fillna(0)) / 2.) ** .5
    num_variables = corr.shape[0]

    max_num_clusters = self.max_num_clusters
    if max_num_clusters is None:
        # if the max number of clusters wasn't specified, declare it based on corr
        max_num_clusters = num_variables // 2
    # Silhouette values are only defined for 2 <= n_labels <= n_samples - 1, so
    # never ask for more clusters than that.
    max_num_clusters = min(max_num_clusters, num_variables - 1)

    best_quality = None
    best_kmeans = None
    for trial in range(self.num_clustering_trials):
        for num_clusters in range(2, max_num_clusters + 1):  # find optimal num clusters
            # `n_jobs` is intentionally not passed: it was removed from KMeans in
            # scikit-learn 1.0 and its old default already meant a single job.
            candidate = KMeans(
                n_clusters=num_clusters,
                n_init=1,
                random_state=42 + trial,
            ).fit(distance_matrix)
            sample_scores = silhouette_samples(distance_matrix, candidate.labels_)
            quality = sample_scores.mean() / sample_scores.std()
            # A NaN incumbent (e.g. zero mean and zero spread) is always replaced,
            # matching the comparison against NaN in the selection rule.
            if best_quality is None or np.isnan(best_quality) or quality > best_quality:
                best_quality, best_kmeans = quality, candidate

    if best_kmeans is None:
        raise ValueError(
            "Unable to cluster: at least one clustering trial and a maximum of at "
            "least 2 clusters are required "
            f"(num_clustering_trials={self.num_clustering_trials}, "
            f"effective max_num_clusters={max_num_clusters}, "
            f"number of variables={num_variables})."
        )

    labels = best_kmeans.labels_
    # cluster members; keys are plain ints (hash-compatible with the NumPy labels)
    return {
        int(label): corr.columns[labels == label].tolist()
        for label in np.unique(labels)
    }