def _hac_metric_param(estimator):
    """Return the name of the estimator parameter that selects the distance."""
    # scikit-learn 1.2 renamed ``affinity`` to ``metric`` and 1.4 dropped the
    # old name; assigning a non-existent parameter would be silently ignored.
    return 'metric' if 'metric' in estimator.get_params() else 'affinity'


def _hac_fit_input(T, affinity):
    """Return ``(metric, X)`` to hand to the clusterer for ``affinity``."""
    if affinity == 'rbf':
        # Negated RBF kernel of the Euclidean distance: the more similar two
        # samples are, the smaller (more negative) their dissimilarity.
        return 'precomputed', -np.exp(-pairwise_distances(T, metric='euclidean'))
    if affinity in ('canberra', 'correlation'):
        # These are passed to the clusterer as a precomputed distance matrix
        # rather than by name.
        return 'precomputed', pairwise_distances(T, metric=affinity)
    return affinity, T


def hac_clustering_experiment(datasets, save_results_path=None, preprocess=None, verbose=True,
                              return_datasets=False):
    """Score agglomerative clustering over a grid of affinities and linkages.

    Every dataset is clustered with each combination of the affinities
    ``l1, l2, cosine, canberra, correlation, rbf`` and the linkages
    ``complete, average``. The number of clusters requested equals the number
    of distinct ground-truth labels of the dataset.

    Parameters
    ----------
    datasets : dict
        Maps a dataset name to its entry. Unless a preprocess callable is
        registered for the name, the entry must provide ``'data'`` and
        ``'labels'``.
    save_results_path : path-like, optional
        When given, the (partially filled) results are written with
        ``np.savez_compressed`` after every affinity/linkage run, so an
        interrupted experiment leaves a usable checkpoint.
    preprocess : dict, optional
        Maps a dataset name to a callable that receives the dataset entry and
        returns ``(data, labels)``.
    verbose : bool
        Print progress and the scores of every run.
    return_datasets : bool
        Also return the flattened data and labels actually used.

    Returns
    -------
    tuple
        ``(result, affinity_names, linkage, dataset_names, n_clusters)``, with
        ``updated_datasets`` appended when ``return_datasets`` is true.
        ``result`` has shape ``(n_datasets, n_affinities, n_linkages, 3)``;
        the last axis holds ARI, AMI and FMI. ``n_clusters`` is the cluster
        count of the *last* dataset processed, or ``None`` when ``datasets``
        is empty.
    """
    affinity_names = ['l1', 'l2', 'cosine', 'canberra', 'correlation', 'rbf']
    linkage = ['complete', 'average']
    dataset_names = list(datasets.keys())
    result = np.zeros([len(dataset_names), len(affinity_names), len(linkage), 3])
    updated_datasets = {} if return_datasets else None
    # Stays None when there is nothing to cluster, so the return below is
    # well defined for an empty ``datasets``.
    n_clusters = None

    for i, dataset_name in enumerate(dataset_names):
        if verbose:
            print(f'\t\t\t {dataset_name}')

        entry = datasets[dataset_name]
        if (preprocess is not None) and (dataset_name in preprocess):
            T, labels = preprocess[dataset_name](entry)
            T = numpy_tools.flatten_np(T)
        else:
            # NOTE(review): flatten_np presumably yields one row per sample;
            # confirm against numpy_tools.
            T = numpy_tools.flatten_np(entry['data'])
            labels = entry['labels']
        if return_datasets:
            updated_datasets[dataset_name] = {'data': T, 'labels': labels}

        n_clusters = len(np.unique(labels))
        clustAlg = AgglomerativeClustering(n_clusters=n_clusters)
        metric_param = _hac_metric_param(clustAlg)

        for k_affinity, current_affinity in enumerate(affinity_names):
            if verbose:
                print(f'\t\t Affinity: {current_affinity}')
            # The clusterer input depends only on the affinity, so build it
            # once instead of once per linkage.
            metric, X = _hac_fit_input(T, current_affinity)

            for k_linkage, current_linkage in enumerate(linkage):
                if verbose:
                    print(f'\t Linkage: {current_linkage}')
                clustAlg.set_params(**{metric_param: metric, 'linkage': current_linkage})
                pred = clustAlg.fit_predict(X)

                ari = adjusted_rand_score(labels, pred)
                ami = adjusted_mutual_info_score(labels, pred, average_method='arithmetic')
                fmi = fowlkes_mallows_score(labels, pred)
                result[i, k_affinity, k_linkage, :] = (ari, ami, fmi)

                if save_results_path is not None:
                    np.savez_compressed(save_results_path, result=result,
                                        affinity_names=affinity_names, linkage=linkage,
                                        dataset_names=dataset_names, n_clusters=n_clusters)
                if verbose:
                    print(f'ARI={ari:.3f} '
                          f'AMI={ami:.3f} '
                          f'FMI={fmi:.3f} ')

    if return_datasets:
        return result, affinity_names, linkage, dataset_names, n_clusters, updated_datasets
    return result, affinity_names, linkage, dataset_names, n_clusters